author     clang-format <noreply@google.com>    2016-07-22 20:07:03 -0700
committer  James Zern <jzern@google.com>        2016-07-25 14:14:19 -0700
commit     099bd7f07e510a4ea26acd9c1f2820be7785a510 (patch)
tree       cfbf7f94ce8365ff7271b6365b4e8ef3b0f6440e
parent     82070ae9393b1e79559d81fcf1aa89c2e4aa58ee (diff)
vpx_dsp: apply clang-format
Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975
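
A tree-wide formatting change like this is normally produced by running clang-format in place over the affected sources, using the style file checked into the repository. The commit does not record the exact command used, so the invocation below is only an assumed sketch, not the author's recorded workflow.

# Hypothetical invocation (not recorded in this commit): reformat every C
# source and header under vpx_dsp/ in place using the project's .clang-format.
find vpx_dsp -name '*.c' -o -name '*.h' | xargs clang-format -i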
-rw-r--r--  vpx_dsp/add_noise.c | 2
-rw-r--r--  vpx_dsp/arm/avg_neon.c | 37
-rw-r--r--  vpx_dsp/arm/fwd_txfm_neon.c | 16
-rw-r--r--  vpx_dsp/arm/hadamard_neon.c | 26
-rw-r--r--  vpx_dsp/arm/idct16x16_1_add_neon.c | 85
-rw-r--r--  vpx_dsp/arm/idct16x16_add_neon.c | 2369
-rw-r--r--  vpx_dsp/arm/idct16x16_neon.c | 104
-rw-r--r--  vpx_dsp/arm/idct32x32_1_add_neon.c | 247
-rw-r--r--  vpx_dsp/arm/idct32x32_add_neon.c | 1331
-rw-r--r--  vpx_dsp/arm/idct4x4_1_add_neon.c | 56
-rw-r--r--  vpx_dsp/arm/idct4x4_add_neon.c | 271
-rw-r--r--  vpx_dsp/arm/idct8x8_1_add_neon.c | 93
-rw-r--r--  vpx_dsp/arm/idct8x8_add_neon.c | 1004
-rw-r--r--  vpx_dsp/arm/intrapred_neon.c | 110
-rw-r--r--  vpx_dsp/arm/loopfilter_16_neon.c | 314
-rw-r--r--  vpx_dsp/arm/loopfilter_4_neon.c | 477
-rw-r--r--  vpx_dsp/arm/loopfilter_8_neon.c | 696
-rw-r--r--  vpx_dsp/arm/loopfilter_neon.c | 32
-rw-r--r--  vpx_dsp/arm/sad4d_neon.c | 52
-rw-r--r--  vpx_dsp/arm/sad_neon.c | 165
-rw-r--r--  vpx_dsp/arm/subpel_variance_media.c | 99
-rw-r--r--  vpx_dsp/arm/subpel_variance_neon.c | 65
-rw-r--r--  vpx_dsp/arm/subtract_neon.c | 49
-rw-r--r--  vpx_dsp/arm/variance_neon.c | 515
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_avg_neon.c | 112
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_neon.c | 116
-rw-r--r--  vpx_dsp/arm/vpx_convolve_avg_neon.c | 39
-rw-r--r--  vpx_dsp/arm/vpx_convolve_copy_neon.c | 24
-rw-r--r--  vpx_dsp/arm/vpx_convolve_neon.c | 37
-rw-r--r--  vpx_dsp/avg.c | 58
-rw-r--r--  vpx_dsp/bitreader.c | 27
-rw-r--r--  vpx_dsp/bitreader.h | 16
-rw-r--r--  vpx_dsp/bitreader_buffer.c | 11
-rw-r--r--  vpx_dsp/bitwriter.c | 15
-rw-r--r--  vpx_dsp/bitwriter.h | 3
-rw-r--r--  vpx_dsp/bitwriter_buffer.c | 9
-rw-r--r--  vpx_dsp/deblock.c | 83
-rw-r--r--  vpx_dsp/fastssim.c | 226
-rw-r--r--  vpx_dsp/fwd_txfm.c | 103
-rw-r--r--  vpx_dsp/intrapred.c | 259
-rw-r--r--  vpx_dsp/inv_txfm.c | 316
-rw-r--r--  vpx_dsp/inv_txfm.h | 14
-rw-r--r--  vpx_dsp/loopfilter.c | 463
-rw-r--r--  vpx_dsp/mips/add_noise_msa.c | 4
-rw-r--r--  vpx_dsp/mips/common_dspr2.h | 24
-rw-r--r--  vpx_dsp/mips/convolve2_avg_dspr2.c | 122
-rw-r--r--  vpx_dsp/mips/convolve2_avg_horiz_dspr2.c | 182
-rw-r--r--  vpx_dsp/mips/convolve2_dspr2.c | 1049
-rw-r--r--  vpx_dsp/mips/convolve2_horiz_dspr2.c | 145
-rw-r--r--  vpx_dsp/mips/convolve2_vert_dspr2.c | 132
-rw-r--r--  vpx_dsp/mips/convolve8_avg_dspr2.c | 355
-rw-r--r--  vpx_dsp/mips/convolve8_avg_horiz_dspr2.c | 206
-rw-r--r--  vpx_dsp/mips/convolve8_dspr2.c | 1542
-rw-r--r--  vpx_dsp/mips/convolve8_horiz_dspr2.c | 175
-rw-r--r--  vpx_dsp/mips/convolve8_vert_dspr2.c | 112
-rw-r--r--  vpx_dsp/mips/convolve_common_dspr2.h | 19
-rw-r--r--  vpx_dsp/mips/deblock_msa.c | 345
-rw-r--r--  vpx_dsp/mips/fwd_dct32x32_msa.c | 110
-rw-r--r--  vpx_dsp/mips/fwd_txfm_msa.c | 68
-rw-r--r--  vpx_dsp/mips/fwd_txfm_msa.h | 669
-rw-r--r--  vpx_dsp/mips/idct16x16_msa.c | 52
-rw-r--r--  vpx_dsp/mips/idct32x32_msa.c | 92
-rw-r--r--  vpx_dsp/mips/idct4x4_msa.c | 4
-rw-r--r--  vpx_dsp/mips/idct8x8_msa.c | 28
-rw-r--r--  vpx_dsp/mips/intrapred16_dspr2.c | 49
-rw-r--r--  vpx_dsp/mips/intrapred4_dspr2.c | 67
-rw-r--r--  vpx_dsp/mips/intrapred8_dspr2.c | 79
-rw-r--r--  vpx_dsp/mips/intrapred_msa.c | 41
-rw-r--r--  vpx_dsp/mips/inv_txfm_dspr2.h | 56
-rw-r--r--  vpx_dsp/mips/inv_txfm_msa.h | 749
-rw-r--r--  vpx_dsp/mips/itrans16_dspr2.c | 474
-rw-r--r--  vpx_dsp/mips/itrans32_cols_dspr2.c | 413
-rw-r--r--  vpx_dsp/mips/itrans32_dspr2.c | 408
-rw-r--r--  vpx_dsp/mips/itrans4_dspr2.c | 130
-rw-r--r--  vpx_dsp/mips/itrans8_dspr2.c | 162
-rw-r--r--  vpx_dsp/mips/loopfilter_16_msa.c | 90
-rw-r--r--  vpx_dsp/mips/loopfilter_4_msa.c | 30
-rw-r--r--  vpx_dsp/mips/loopfilter_8_msa.c | 68
-rw-r--r--  vpx_dsp/mips/loopfilter_filters_dspr2.c | 210
-rw-r--r--  vpx_dsp/mips/loopfilter_filters_dspr2.h | 308
-rw-r--r--  vpx_dsp/mips/loopfilter_macros_dspr2.h | 835
-rw-r--r--  vpx_dsp/mips/loopfilter_masks_dspr2.h | 122
-rw-r--r--  vpx_dsp/mips/loopfilter_mb_dspr2.c | 427
-rw-r--r--  vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c | 563
-rw-r--r--  vpx_dsp/mips/loopfilter_mb_vert_dspr2.c | 521
-rw-r--r--  vpx_dsp/mips/loopfilter_msa.h | 456
-rw-r--r--  vpx_dsp/mips/macros_msa.h | 2141
-rw-r--r--  vpx_dsp/mips/sad_msa.c | 375
-rw-r--r--  vpx_dsp/mips/sub_pixel_variance_msa.c | 784
-rw-r--r--  vpx_dsp/mips/subtract_msa.c | 32
-rw-r--r--  vpx_dsp/mips/txfm_macros_msa.h | 149
-rw-r--r--  vpx_dsp/mips/variance_msa.c | 88
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c | 172
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_avg_msa.c | 263
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c | 196
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_horiz_msa.c | 62
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_msa.c | 118
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_vert_msa.c | 64
-rw-r--r--  vpx_dsp/mips/vpx_convolve_avg_msa.c | 56
-rw-r--r--  vpx_dsp/mips/vpx_convolve_copy_msa.c | 8
-rw-r--r--  vpx_dsp/mips/vpx_convolve_msa.h | 195
-rw-r--r--  vpx_dsp/prob.c | 38
-rw-r--r--  vpx_dsp/prob.h | 10
-rw-r--r--  vpx_dsp/psnr.c | 111
-rw-r--r--  vpx_dsp/psnr.h | 20
-rw-r--r--  vpx_dsp/psnrhvs.c | 171
-rw-r--r--  vpx_dsp/quantize.c | 122
-rw-r--r--  vpx_dsp/quantize.h | 19
-rw-r--r--  vpx_dsp/sad.c | 149
-rw-r--r--  vpx_dsp/ssim.c | 143
-rw-r--r--  vpx_dsp/ssim.h | 16
-rw-r--r--  vpx_dsp/subtract.c | 26
-rw-r--r--  vpx_dsp/txfm_common.h | 20
-rw-r--r--  vpx_dsp/variance.c | 602
-rw-r--r--  vpx_dsp/variance.h | 60
-rw-r--r--  vpx_dsp/vpx_convolve.c | 270
-rw-r--r--  vpx_dsp/vpx_convolve.h | 4
-rw-r--r--  vpx_dsp/vpx_dsp_common.h | 9
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd.c | 4
-rw-r--r--  vpx_dsp/vpx_filter.h | 1
-rw-r--r--  vpx_dsp/x86/avg_intrin_sse2.c | 27
-rw-r--r--  vpx_dsp/x86/convolve.h | 410
-rw-r--r--  vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 3260
-rw-r--r--  vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 3167
-rw-r--r--  vpx_dsp/x86/fwd_txfm_avx2.c | 10
-rw-r--r--  vpx_dsp/x86/fwd_txfm_impl_sse2.h | 230
-rw-r--r--  vpx_dsp/x86/fwd_txfm_sse2.c | 160
-rw-r--r--  vpx_dsp/x86/fwd_txfm_sse2.h | 269
-rw-r--r--  vpx_dsp/x86/halfpix_variance_sse2.c | 20
-rw-r--r--  vpx_dsp/x86/highbd_loopfilter_sse2.c | 424
-rw-r--r--  vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 73
-rw-r--r--  vpx_dsp/x86/highbd_variance_sse2.c | 794
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c | 2233
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.h | 80
-rw-r--r--  vpx_dsp/x86/loopfilter_avx2.c | 1654
-rw-r--r--  vpx_dsp/x86/loopfilter_sse2.c | 562
-rw-r--r--  vpx_dsp/x86/quantize_sse2.c | 14
-rw-r--r--  vpx_dsp/x86/sad4d_avx2.c | 36
-rw-r--r--  vpx_dsp/x86/sad_avx2.c | 287
-rw-r--r--  vpx_dsp/x86/txfm_common_sse2.h | 6
-rw-r--r--  vpx_dsp/x86/variance_avx2.c | 122
-rw-r--r--  vpx_dsp/x86/variance_impl_avx2.c | 719
-rw-r--r--  vpx_dsp/x86/variance_sse2.c | 334
-rw-r--r--  vpx_dsp/x86/vpx_asm_stubs.c | 4
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 491
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 207
146 files changed, 21368 insertions, 22562 deletions
diff --git a/vpx_dsp/add_noise.c b/vpx_dsp/add_noise.c
index 29dbd5c75..4d192d78e 100644
--- a/vpx_dsp/add_noise.c
+++ b/vpx_dsp/add_noise.c
@@ -48,7 +48,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
// set up a 256 entry lookup that matches gaussian distribution
for (i = -32; i < 32; ++i) {
- const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
+ const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
if (a_i) {
for (j = 0; j < a_i; ++j) {
char_dist[next + j] = (char)i;
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index e52958c54..001517d33 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
}
}
-void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int *min, int *max) {
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max) {
// Load and concatenate.
- const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
- vld1_u8(a + a_stride));
- const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
- vld1_u8(a + 3 * a_stride));
- const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
- vld1_u8(a + 5 * a_stride));
- const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
- vld1_u8(a + 7 * a_stride));
-
- const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
- vld1_u8(b + b_stride));
- const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
- vld1_u8(b + 3 * b_stride));
- const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
- vld1_u8(b + 5 * b_stride));
- const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
- vld1_u8(b + 7 * b_stride));
+ const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+ const uint8x16_t a23 =
+ vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+ const uint8x16_t a45 =
+ vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+ const uint8x16_t a67 =
+ vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+
+ const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+ const uint8x16_t b23 =
+ vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+ const uint8x16_t b45 =
+ vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+ const uint8x16_t b67 =
+ vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
// Absolute difference.
const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
index 9f9de98d9..7cb2ba90d 100644
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
// 14 15 16 17 54 55 56 57
// 24 25 26 27 64 65 66 67
// 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
- vreinterpretq_s32_s16(out_2));
- const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
- vreinterpretq_s32_s16(out_3));
- const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
- vreinterpretq_s32_s16(out_6));
- const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
- vreinterpretq_s32_s16(out_7));
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
const int16x8x2_t r01_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
vreinterpretq_s16_s32(r13_s32.val[0]));
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index 21e3e3dba..46b2755ea 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -12,9 +12,8 @@
#include "./vpx_dsp_rtcd.h"
-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
- int16x8_t *a2, int16x8_t *a3,
- int16x8_t *a4, int16x8_t *a5,
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
const int16x8_t b0 = vaddq_s16(*a0, *a1);
const int16x8_t b1 = vsubq_s16(*a0, *a1);
@@ -47,9 +46,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
// reversing transpose order which may make it easier for the compiler to
// reconcile the vtrn.64 moves.
-static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
- int16x8_t *a2, int16x8_t *a3,
- int16x8_t *a4, int16x8_t *a5,
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
// Swap 64 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
@@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
// a1657_hi:
// 12 13 28 29 44 45 60 61
// 14 15 30 31 46 47 62 63
- const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
- vreinterpretq_s32_s16(a26_lo));
- const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
- vreinterpretq_s32_s16(a37_lo));
- const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
- vreinterpretq_s32_s16(a26_hi));
- const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
- vreinterpretq_s32_s16(a37_hi));
+ const int32x4x2_t a0246_lo =
+ vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+ const int32x4x2_t a1357_lo =
+ vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+ const int32x4x2_t a0246_hi =
+ vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+ const int32x4x2_t a1357_hi =
+ vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
// Swap 16 bit elements resulting in:
// b0:
diff --git a/vpx_dsp/arm/idct16x16_1_add_neon.c b/vpx_dsp/arm/idct16x16_1_add_neon.c
index f734e4802..466b40889 100644
--- a/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -13,49 +13,46 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
-void vpx_idct16x16_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, j, a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
-
- for (d1 = d2 = dest, i = 0; i < 4; i++) {
- for (j = 0; j < 2; j++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
-
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
- }
+void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, j, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ for (d1 = d2 = dest, i = 0; i < 4; i++) {
+ for (j = 0; j < 2; j++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
}
- return;
+ }
+ return;
}
diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c
index 651ebb21f..6c03aff60 100644
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -13,1175 +13,736 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
-static INLINE void TRANSPOSE8X8(
- int16x8_t *q8s16,
- int16x8_t *q9s16,
- int16x8_t *q10s16,
- int16x8_t *q11s16,
- int16x8_t *q12s16,
- int16x8_t *q13s16,
- int16x8_t *q14s16,
- int16x8_t *q15s16) {
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
- int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
- *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
- *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
- *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
- *q12s16 = vcombine_s16(d17s16, d25s16);
- *q13s16 = vcombine_s16(d19s16, d27s16);
- *q14s16 = vcombine_s16(d21s16, d29s16);
- *q15s16 = vcombine_s16(d23s16, d31s16);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
- vreinterpretq_s32_s16(*q10s16));
- q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
- vreinterpretq_s32_s16(*q11s16));
- q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
- vreinterpretq_s32_s16(*q14s16));
- q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
- vreinterpretq_s32_s16(*q15s16));
-
- q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
- vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
- q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
- vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
- q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
- vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
- q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
- vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
-
- *q8s16 = q0x2s16.val[0];
- *q9s16 = q0x2s16.val[1];
- *q10s16 = q1x2s16.val[0];
- *q11s16 = q1x2s16.val[1];
- *q12s16 = q2x2s16.val[0];
- *q13s16 = q2x2s16.val[1];
- *q14s16 = q3x2s16.val[0];
- *q15s16 = q3x2s16.val[1];
- return;
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
}
-void vpx_idct16x16_256_add_neon_pass1(
- int16_t *in,
- int16_t *out,
- int output_stride) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(in);
- q8s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q9s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q10s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q11s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q12s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q13s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q14s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
-
- // stage 3
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d18s16, d1s16);
- q6s32 = vmull_s16(d19s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
- q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
-
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q5s32, 14);
- d15s16 = vqrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- q2s32 = vmull_s16(d26s16, d2s16);
- q3s32 = vmull_s16(d27s16, d2s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q15s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
- q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
- d10s16 = vqrshrn_n_s32(q2s32, 14);
- d11s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q15s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 4
- d30s16 = vdup_n_s16(cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d30s16);
- q11s32 = vmull_s16(d17s16, d30s16);
- q0s32 = vmull_s16(d24s16, d30s16);
- q1s32 = vmull_s16(d25s16, d30s16);
-
- d30s16 = vdup_n_s16(cospi_24_64);
- d31s16 = vdup_n_s16(cospi_8_64);
-
- q3s32 = vaddq_s32(q2s32, q0s32);
- q12s32 = vaddq_s32(q11s32, q1s32);
- q13s32 = vsubq_s32(q2s32, q0s32);
- q1s32 = vsubq_s32(q11s32, q1s32);
-
- d16s16 = vqrshrn_n_s32(q3s32, 14);
- d17s16 = vqrshrn_n_s32(q12s32, 14);
- d18s16 = vqrshrn_n_s32(q13s32, 14);
- d19s16 = vqrshrn_n_s32(q1s32, 14);
- q8s16 = vcombine_s16(d16s16, d17s16);
- q9s16 = vcombine_s16(d18s16, d19s16);
-
- q0s32 = vmull_s16(d20s16, d31s16);
- q1s32 = vmull_s16(d21s16, d31s16);
- q12s32 = vmull_s16(d20s16, d30s16);
- q13s32 = vmull_s16(d21s16, d30s16);
-
- q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
- q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
- q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
- q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
- d22s16 = vqrshrn_n_s32(q0s32, 14);
- d23s16 = vqrshrn_n_s32(q1s32, 14);
- d20s16 = vqrshrn_n_s32(q12s32, 14);
- d21s16 = vqrshrn_n_s32(q13s32, 14);
- q10s16 = vcombine_s16(d20s16, d21s16);
- q11s16 = vcombine_s16(d22s16, d23s16);
-
- q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q15s16 = vaddq_s16(q6s16, q7s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- // stage 5
- q0s16 = vaddq_s16(q8s16, q11s16);
- q1s16 = vaddq_s16(q9s16, q10s16);
- q2s16 = vsubq_s16(q9s16, q10s16);
- q3s16 = vsubq_s16(q8s16, q11s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
-
- q11s32 = vmull_s16(d26s16, d16s16);
- q12s32 = vmull_s16(d27s16, d16s16);
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
-
- q6s32 = vsubq_s32(q9s32, q11s32);
- q13s32 = vsubq_s32(q10s32, q12s32);
- q9s32 = vaddq_s32(q9s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q12s32);
-
- d10s16 = vqrshrn_n_s32(q6s32, 14);
- d11s16 = vqrshrn_n_s32(q13s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q10s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 6
- q8s16 = vaddq_s16(q0s16, q15s16);
- q9s16 = vaddq_s16(q1s16, q6s16);
- q10s16 = vaddq_s16(q2s16, q5s16);
- q11s16 = vaddq_s16(q3s16, q4s16);
- q12s16 = vsubq_s16(q3s16, q4s16);
- q13s16 = vsubq_s16(q2s16, q5s16);
- q14s16 = vsubq_s16(q1s16, q6s16);
+void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
+ int output_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d18s16, d1s16);
+ q6s32 = vmull_s16(d19s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+ q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q5s32, 14);
+ d15s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ q2s32 = vmull_s16(d26s16, d2s16);
+ q3s32 = vmull_s16(d27s16, d2s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q15s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+ d10s16 = vqrshrn_n_s32(q2s32, 14);
+ d11s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q15s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d30s16);
+ q11s32 = vmull_s16(d17s16, d30s16);
+ q0s32 = vmull_s16(d24s16, d30s16);
+ q1s32 = vmull_s16(d25s16, d30s16);
+
+ d30s16 = vdup_n_s16(cospi_24_64);
+ d31s16 = vdup_n_s16(cospi_8_64);
+
+ q3s32 = vaddq_s32(q2s32, q0s32);
+ q12s32 = vaddq_s32(q11s32, q1s32);
+ q13s32 = vsubq_s32(q2s32, q0s32);
+ q1s32 = vsubq_s32(q11s32, q1s32);
+
+ d16s16 = vqrshrn_n_s32(q3s32, 14);
+ d17s16 = vqrshrn_n_s32(q12s32, 14);
+ d18s16 = vqrshrn_n_s32(q13s32, 14);
+ d19s16 = vqrshrn_n_s32(q1s32, 14);
+ q8s16 = vcombine_s16(d16s16, d17s16);
+ q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q0s32 = vmull_s16(d20s16, d31s16);
+ q1s32 = vmull_s16(d21s16, d31s16);
+ q12s32 = vmull_s16(d20s16, d30s16);
+ q13s32 = vmull_s16(d21s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+ q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+ d22s16 = vqrshrn_n_s32(q0s32, 14);
+ d23s16 = vqrshrn_n_s32(q1s32, 14);
+ d20s16 = vqrshrn_n_s32(q12s32, 14);
+ d21s16 = vqrshrn_n_s32(q13s32, 14);
+ q10s16 = vcombine_s16(d20s16, d21s16);
+ q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q15s16 = vaddq_s16(q6s16, q7s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ // stage 5
+ q0s16 = vaddq_s16(q8s16, q11s16);
+ q1s16 = vaddq_s16(q9s16, q10s16);
+ q2s16 = vsubq_s16(q9s16, q10s16);
+ q3s16 = vsubq_s16(q8s16, q11s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q11s32 = vmull_s16(d26s16, d16s16);
+ q12s32 = vmull_s16(d27s16, d16s16);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q13s32 = vsubq_s32(q10s32, q12s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d11s16 = vqrshrn_n_s32(q13s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q8s16 = vaddq_s16(q0s16, q15s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d16u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride) {
+ uint8_t *d;
+ uint8x8_t d12u8, d13u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64;
+ int64x1_t d12s64, d13s64;
+ uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+ uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d12s16 = vdup_n_s16(cospi_30_64);
+ d13s16 = vdup_n_s16(cospi_2_64);
+
+ q2s32 = vmull_s16(d16s16, d12s16);
+ q3s32 = vmull_s16(d17s16, d12s16);
+ q1s32 = vmull_s16(d16s16, d13s16);
+ q4s32 = vmull_s16(d17s16, d13s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+ q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+ q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+ d0s16 = vqrshrn_n_s32(q2s32, 14);
+ d1s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q1s32, 14);
+ d15s16 = vqrshrn_n_s32(q4s32, 14);
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_14_64);
+ d31s16 = vdup_n_s16(cospi_18_64);
+
+ q2s32 = vmull_s16(d24s16, d30s16);
+ q3s32 = vmull_s16(d25s16, d30s16);
+ q4s32 = vmull_s16(d24s16, d31s16);
+ q5s32 = vmull_s16(d25s16, d31s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q2s32, 14);
+ d3s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q4s32, 14);
+ d13s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(cospi_22_64);
+ d31s16 = vdup_n_s16(cospi_10_64);
+
+ q11s32 = vmull_s16(d20s16, d30s16);
+ q12s32 = vmull_s16(d21s16, d30s16);
+ q4s32 = vmull_s16(d20s16, d31s16);
+ q5s32 = vmull_s16(d21s16, d31s16);
+
+ q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d11s16 = vqrshrn_n_s32(q5s32, 14);
+ d10s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ d30s16 = vdup_n_s16(cospi_6_64);
+ d31s16 = vdup_n_s16(cospi_26_64);
+
+ q10s32 = vmull_s16(d28s16, d30s16);
+ q11s32 = vmull_s16(d29s16, d30s16);
+ q12s32 = vmull_s16(d28s16, d31s16);
+ q13s32 = vmull_s16(d29s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+ q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+ q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+ q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q11s32, 14);
+ d8s16 = vqrshrn_n_s32(q12s32, 14);
+ d9s16 = vqrshrn_n_s32(q13s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 3
+ q9s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q10s16 = vsubq_s16(q3s16, q2s16);
+ q11s16 = vaddq_s16(q2s16, q3s16);
+ q12s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q6s16, q7s16);
+
+ // stage 4
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q2s32 = vmull_s16(d18s16, d31s16);
+ q3s32 = vmull_s16(d19s16, d31s16);
+ q4s32 = vmull_s16(d28s16, d31s16);
+ q5s32 = vmull_s16(d29s16, d31s16);
+
+ q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+ q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q3s32, 14);
+ d2s16 = vqrshrn_n_s32(q4s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q3s16 = q11s16;
+ q4s16 = q12s16;
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q11s32 = vmull_s16(d26s16, d30s16);
+ q12s32 = vmull_s16(d27s16, d30s16);
+ q8s32 = vmull_s16(d20s16, d30s16);
+ q9s32 = vmull_s16(d21s16, d30s16);
+
+ q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q10s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q10s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ if (skip_adding != 0) {
+ d = dest;
+ // load the data in pass1
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q14s16 = vsubq_s16(q1s16, q14s16);
q15s16 = vsubq_s16(q0s16, q15s16);
- d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
- d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
- d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
- d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
- d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- // store the data
- output_stride >>= 1; // output_stride / 2, out is int16_t
- vst1_u64((uint64_t *)out, d16u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d17u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d18u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d19u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d20u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d21u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d22u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d23u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d24u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d25u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d26u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d27u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d28u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d29u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d30u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d31u64);
- return;
-}
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
-void vpx_idct16x16_256_add_neon_pass2(
- int16_t *src,
- int16_t *out,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride) {
- uint8_t *d;
- uint8x8_t d12u8, d13u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- uint64x1_t d24u64, d25u64, d26u64, d27u64;
- int64x1_t d12s64, d13s64;
- uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
- uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(src);
- q8s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q9s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q10s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q11s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q12s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q13s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q14s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
-
- // stage 3
- d12s16 = vdup_n_s16(cospi_30_64);
- d13s16 = vdup_n_s16(cospi_2_64);
-
- q2s32 = vmull_s16(d16s16, d12s16);
- q3s32 = vmull_s16(d17s16, d12s16);
- q1s32 = vmull_s16(d16s16, d13s16);
- q4s32 = vmull_s16(d17s16, d13s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
- q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
- q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
- d0s16 = vqrshrn_n_s32(q2s32, 14);
- d1s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q1s32, 14);
- d15s16 = vqrshrn_n_s32(q4s32, 14);
- q0s16 = vcombine_s16(d0s16, d1s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d30s16 = vdup_n_s16(cospi_14_64);
- d31s16 = vdup_n_s16(cospi_18_64);
-
- q2s32 = vmull_s16(d24s16, d30s16);
- q3s32 = vmull_s16(d25s16, d30s16);
- q4s32 = vmull_s16(d24s16, d31s16);
- q5s32 = vmull_s16(d25s16, d31s16);
-
- q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
- q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
- q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
- q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
- d2s16 = vqrshrn_n_s32(q2s32, 14);
- d3s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q4s32, 14);
- d13s16 = vqrshrn_n_s32(q5s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- d30s16 = vdup_n_s16(cospi_22_64);
- d31s16 = vdup_n_s16(cospi_10_64);
-
- q11s32 = vmull_s16(d20s16, d30s16);
- q12s32 = vmull_s16(d21s16, d30s16);
- q4s32 = vmull_s16(d20s16, d31s16);
- q5s32 = vmull_s16(d21s16, d31s16);
-
- q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
- q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
- q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
- q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
- d4s16 = vqrshrn_n_s32(q11s32, 14);
- d5s16 = vqrshrn_n_s32(q12s32, 14);
- d11s16 = vqrshrn_n_s32(q5s32, 14);
- d10s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- d30s16 = vdup_n_s16(cospi_6_64);
- d31s16 = vdup_n_s16(cospi_26_64);
-
- q10s32 = vmull_s16(d28s16, d30s16);
- q11s32 = vmull_s16(d29s16, d30s16);
- q12s32 = vmull_s16(d28s16, d31s16);
- q13s32 = vmull_s16(d29s16, d31s16);
-
- q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
- q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
- q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
- q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q11s32, 14);
- d8s16 = vqrshrn_n_s32(q12s32, 14);
- d9s16 = vqrshrn_n_s32(q13s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
-
- // stage 3
- q9s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q10s16 = vsubq_s16(q3s16, q2s16);
- q11s16 = vaddq_s16(q2s16, q3s16);
- q12s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q6s16, q7s16);
-
- // stage 4
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
-
- q2s32 = vmull_s16(d18s16, d31s16);
- q3s32 = vmull_s16(d19s16, d31s16);
- q4s32 = vmull_s16(d28s16, d31s16);
- q5s32 = vmull_s16(d29s16, d31s16);
-
- q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
- q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
- q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
- q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
- d12s16 = vqrshrn_n_s32(q2s32, 14);
- d13s16 = vqrshrn_n_s32(q3s32, 14);
- d2s16 = vqrshrn_n_s32(q4s32, 14);
- d3s16 = vqrshrn_n_s32(q5s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- q3s16 = q11s16;
- q4s16 = q12s16;
-
- d30s16 = vdup_n_s16(-cospi_8_64);
- q11s32 = vmull_s16(d26s16, d30s16);
- q12s32 = vmull_s16(d27s16, d30s16);
- q8s32 = vmull_s16(d20s16, d30s16);
- q9s32 = vmull_s16(d21s16, d30s16);
-
- q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
- q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
- q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
- q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
- d4s16 = vqrshrn_n_s32(q11s32, 14);
- d5s16 = vqrshrn_n_s32(q12s32, 14);
- d10s16 = vqrshrn_n_s32(q8s32, 14);
- d11s16 = vqrshrn_n_s32(q9s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- // stage 5
- q8s16 = vaddq_s16(q0s16, q3s16);
- q9s16 = vaddq_s16(q1s16, q2s16);
- q10s16 = vsubq_s16(q1s16, q2s16);
- q11s16 = vsubq_s16(q0s16, q3s16);
- q12s16 = vsubq_s16(q7s16, q4s16);
- q13s16 = vsubq_s16(q6s16, q5s16);
- q14s16 = vaddq_s16(q6s16, q5s16);
- q15s16 = vaddq_s16(q7s16, q4s16);
-
- // stage 6
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
-
- d14s16 = vdup_n_s16(cospi_16_64);
-
- q3s32 = vmull_s16(d26s16, d14s16);
- q4s32 = vmull_s16(d27s16, d14s16);
- q0s32 = vmull_s16(d20s16, d14s16);
- q1s32 = vmull_s16(d21s16, d14s16);
-
- q5s32 = vsubq_s32(q3s32, q0s32);
- q6s32 = vsubq_s32(q4s32, q1s32);
- q10s32 = vaddq_s32(q3s32, q0s32);
- q4s32 = vaddq_s32(q4s32, q1s32);
-
- d4s16 = vqrshrn_n_s32(q5s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
- d10s16 = vqrshrn_n_s32(q10s32, 14);
- d11s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q0s32 = vmull_s16(d22s16, d14s16);
- q1s32 = vmull_s16(d23s16, d14s16);
- q13s32 = vmull_s16(d24s16, d14s16);
- q6s32 = vmull_s16(d25s16, d14s16);
-
- q10s32 = vsubq_s32(q13s32, q0s32);
- q4s32 = vsubq_s32(q6s32, q1s32);
- q13s32 = vaddq_s32(q13s32, q0s32);
- q6s32 = vaddq_s32(q6s32, q1s32);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
- d8s16 = vqrshrn_n_s32(q13s32, 14);
- d9s16 = vqrshrn_n_s32(q6s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
-
- // stage 7
- if (skip_adding != 0) {
- d = dest;
- // load the data in pass1
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
-
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- // store the data out 8,9,10,11,12,13,14,15
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q8s16 = vrshrq_n_s16(q8s16, 6);
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q9s16 = vrshrq_n_s16(q9s16, 6);
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q2s16 = vrshrq_n_s16(q2s16, 6);
- q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q3s16 = vrshrq_n_s16(q3s16, 6);
- q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q4s16 = vrshrq_n_s16(q4s16, 6);
- q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q5s16 = vrshrq_n_s16(q5s16, 6);
- q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q14s16 = vrshrq_n_s16(q14s16, 6);
- q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- q15s16 = vrshrq_n_s16(q15s16, 6);
- q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- } else { // skip_adding_dest
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
- }
- return;
-}
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
-void vpx_idct16x16_10_add_neon_pass1(
- int16_t *in,
- int16_t *out,
- int output_stride) {
- int16x4_t d4s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q6s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q15s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(in);
- q8s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q9s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q10s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q11s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q12s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q13s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q14s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- // stage 3
- q0s16 = vdupq_n_s16(cospi_28_64 * 2);
- q1s16 = vdupq_n_s16(cospi_4_64 * 2);
-
- q4s16 = vqrdmulhq_s16(q9s16, q0s16);
- q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
- // stage 4
- q1s16 = vdupq_n_s16(cospi_16_64 * 2);
- d4s16 = vdup_n_s16(cospi_16_64);
-
- q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
- d8s16 = vget_low_s16(q4s16);
- d9s16 = vget_high_s16(q4s16);
- d14s16 = vget_low_s16(q7s16);
- d15s16 = vget_high_s16(q7s16);
- q9s32 = vmull_s16(d14s16, d4s16);
- q10s32 = vmull_s16(d15s16, d4s16);
- q12s32 = vmull_s16(d9s16, d4s16);
- q11s32 = vmull_s16(d8s16, d4s16);
-
- q15s32 = vsubq_s32(q10s32, q12s32);
- q6s32 = vsubq_s32(q9s32, q11s32);
- q9s32 = vaddq_s32(q9s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q12s32);
-
- d11s16 = vqrshrn_n_s32(q15s32, 14);
- d10s16 = vqrshrn_n_s32(q6s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q10s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 6
- q2s16 = vaddq_s16(q8s16, q7s16);
- q9s16 = vaddq_s16(q8s16, q6s16);
- q10s16 = vaddq_s16(q8s16, q5s16);
- q11s16 = vaddq_s16(q8s16, q4s16);
- q12s16 = vsubq_s16(q8s16, q4s16);
- q13s16 = vsubq_s16(q8s16, q5s16);
- q14s16 = vsubq_s16(q8s16, q6s16);
- q15s16 = vsubq_s16(q8s16, q7s16);
-
- d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
- d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
- d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
- d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
- d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- // store the data
- output_stride >>= 1; // output_stride / 2, out is int16_t
- vst1_u64((uint64_t *)out, d4u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d5u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d18u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d19u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d20u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d21u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d22u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d23u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d24u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d25u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d26u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d27u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d28u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d29u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d30u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d31u64);
- return;
-}
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
-void vpx_idct16x16_10_add_neon_pass2(
- int16_t *src,
- int16_t *out,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride) {
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
- uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
- uint64x1_t d16u64, d17u64, d18u64, d19u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32;
- int16x8x2_t q0x2s16;
- (void)skip_adding;
- (void)dest;
- (void)dest_stride;
-
- q0x2s16 = vld2q_s16(src);
- q8s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q9s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q10s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q11s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q12s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q13s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q14s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- // stage 3
- q6s16 = vdupq_n_s16(cospi_30_64 * 2);
- q0s16 = vqrdmulhq_s16(q8s16, q6s16);
- q6s16 = vdupq_n_s16(cospi_2_64 * 2);
- q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
- q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
- q14s16 = vdupq_n_s16(cospi_6_64 * 2);
- q3s16 = vqrdmulhq_s16(q9s16, q15s16);
- q4s16 = vqrdmulhq_s16(q9s16, q14s16);
-
- // stage 4
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
- d6s16 = vget_low_s16(q3s16);
- d7s16 = vget_high_s16(q3s16);
- d8s16 = vget_low_s16(q4s16);
- d9s16 = vget_high_s16(q4s16);
- d14s16 = vget_low_s16(q7s16);
- d15s16 = vget_high_s16(q7s16);
-
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
-
- q12s32 = vmull_s16(d14s16, d31s16);
- q5s32 = vmull_s16(d15s16, d31s16);
- q2s32 = vmull_s16(d0s16, d31s16);
- q11s32 = vmull_s16(d1s16, d31s16);
-
- q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
- q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
- q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
- q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
- d2s16 = vqrshrn_n_s32(q12s32, 14);
- d3s16 = vqrshrn_n_s32(q5s32, 14);
- d12s16 = vqrshrn_n_s32(q2s32, 14);
- d13s16 = vqrshrn_n_s32(q11s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- d30s16 = vdup_n_s16(-cospi_8_64);
- q10s32 = vmull_s16(d8s16, d30s16);
- q13s32 = vmull_s16(d9s16, d30s16);
- q8s32 = vmull_s16(d6s16, d30s16);
- q9s32 = vmull_s16(d7s16, d30s16);
-
- q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
- q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
- q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
- q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
- d4s16 = vqrshrn_n_s32(q10s32, 14);
- d5s16 = vqrshrn_n_s32(q13s32, 14);
- d10s16 = vqrshrn_n_s32(q8s32, 14);
- d11s16 = vqrshrn_n_s32(q9s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- // stage 5
- q8s16 = vaddq_s16(q0s16, q3s16);
- q9s16 = vaddq_s16(q1s16, q2s16);
- q10s16 = vsubq_s16(q1s16, q2s16);
- q11s16 = vsubq_s16(q0s16, q3s16);
- q12s16 = vsubq_s16(q7s16, q4s16);
- q13s16 = vsubq_s16(q6s16, q5s16);
- q14s16 = vaddq_s16(q6s16, q5s16);
- q15s16 = vaddq_s16(q7s16, q4s16);
-
- // stage 6
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
-
- d14s16 = vdup_n_s16(cospi_16_64);
- q3s32 = vmull_s16(d26s16, d14s16);
- q4s32 = vmull_s16(d27s16, d14s16);
- q0s32 = vmull_s16(d20s16, d14s16);
- q1s32 = vmull_s16(d21s16, d14s16);
-
- q5s32 = vsubq_s32(q3s32, q0s32);
- q6s32 = vsubq_s32(q4s32, q1s32);
- q0s32 = vaddq_s32(q3s32, q0s32);
- q4s32 = vaddq_s32(q4s32, q1s32);
-
- d4s16 = vqrshrn_n_s32(q5s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
- d10s16 = vqrshrn_n_s32(q0s32, 14);
- d11s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q0s32 = vmull_s16(d22s16, d14s16);
- q1s32 = vmull_s16(d23s16, d14s16);
- q13s32 = vmull_s16(d24s16, d14s16);
- q6s32 = vmull_s16(d25s16, d14s16);
-
- q10s32 = vsubq_s32(q13s32, q0s32);
- q4s32 = vsubq_s32(q6s32, q1s32);
- q13s32 = vaddq_s32(q13s32, q0s32);
- q6s32 = vaddq_s32(q6s32, q1s32);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
- d8s16 = vqrshrn_n_s32(q13s32, 14);
- d9s16 = vqrshrn_n_s32(q6s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
-
- // stage 7
+ // store the data out 8,9,10,11,12,13,14,15
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q2s16 = vrshrq_n_s16(q2s16, 6);
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q3s16 = vrshrq_n_s16(q3s16, 6);
+ q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+ q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q14s16 = vrshrq_n_s16(q14s16, 6);
+ q14u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ q15s16 = vrshrq_n_s16(q15s16, 6);
+ q15u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ } else { // skip_adding_dest
q0s16 = vld1q_s16(pass1Output);
pass1Output += 8;
q1s16 = vld1q_s16(pass1Output);
@@ -1248,6 +809,7 @@ void vpx_idct16x16_10_add_neon_pass2(
q10s16 = vld1q_s16(pass1Output);
pass1Output += 8;
q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
q12s16 = vaddq_s16(q10s16, q9s16);
q13s16 = vaddq_s16(q11s16, q8s16);
d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
@@ -1265,53 +827,468 @@ void vpx_idct16x16_10_add_neon_pass2(
q8s16 = vsubq_s16(q11s16, q8s16);
q9s16 = vsubq_s16(q10s16, q9s16);
- d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
- d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
- d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
- d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
- d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
- d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
- d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
- d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
- d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
- d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- vst1_u64((uint64_t *)out, d16u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
out += 4;
- vst1_u64((uint64_t *)out, d17u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
out += 12;
- vst1_u64((uint64_t *)out, d18u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
out += 4;
- vst1_u64((uint64_t *)out, d19u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
out += 12;
- vst1_u64((uint64_t *)out, d4u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
out += 4;
- vst1_u64((uint64_t *)out, d5u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
out += 12;
- vst1_u64((uint64_t *)out, d6u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
out += 4;
- vst1_u64((uint64_t *)out, d7u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
out += 12;
- vst1_u64((uint64_t *)out, d8u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
out += 4;
- vst1_u64((uint64_t *)out, d9u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
out += 12;
- vst1_u64((uint64_t *)out, d10u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
out += 4;
- vst1_u64((uint64_t *)out, d11u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
out += 12;
- vst1_u64((uint64_t *)out, d28u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
out += 4;
- vst1_u64((uint64_t *)out, d29u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
out += 12;
- vst1_u64((uint64_t *)out, d30u64);
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
out += 4;
- vst1_u64((uint64_t *)out, d31u64);
- return;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ }
+ return;
+}
+
+void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
+ int output_stride) {
+ int16x4_t d4s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // stage 3
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ // stage 4
+ q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+ d4s16 = vdup_n_s16(cospi_16_64);
+
+ q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+ q9s32 = vmull_s16(d14s16, d4s16);
+ q10s32 = vmull_s16(d15s16, d4s16);
+ q12s32 = vmull_s16(d9s16, d4s16);
+ q11s32 = vmull_s16(d8s16, d4s16);
+
+ q15s32 = vsubq_s32(q10s32, q12s32);
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d11s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q2s16 = vaddq_s16(q8s16, q7s16);
+ q9s16 = vaddq_s16(q8s16, q6s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q8s16, q4s16);
+ q12s16 = vsubq_s16(q8s16, q4s16);
+ q13s16 = vsubq_s16(q8s16, q5s16);
+ q14s16 = vsubq_s16(q8s16, q6s16);
+ q15s16 = vsubq_s16(q8s16, q7s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d4u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+ uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+ (void)skip_adding;
+ (void)dest;
+ (void)dest_stride;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // stage 3
+ q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+ q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+ q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+ q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+ q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+ q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+ q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+ // stage 4
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d6s16 = vget_low_s16(q3s16);
+ d7s16 = vget_high_s16(q3s16);
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q12s32 = vmull_s16(d14s16, d31s16);
+ q5s32 = vmull_s16(d15s16, d31s16);
+ q2s32 = vmull_s16(d0s16, d31s16);
+ q11s32 = vmull_s16(d1s16, d31s16);
+
+ q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+ q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q12s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q11s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q10s32 = vmull_s16(d8s16, d30s16);
+ q13s32 = vmull_s16(d9s16, d30s16);
+ q8s32 = vmull_s16(d6s16, d30s16);
+ q9s32 = vmull_s16(d7s16, d30s16);
+
+ q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q10s32, 14);
+ d5s16 = vqrshrn_n_s32(q13s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q0s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q0s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+ d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+ d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+ d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+ d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+ d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ vst1_u64((uint64_t *)out, d16u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d4u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d6u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d7u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d8u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d9u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d10u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d11u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
}
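The vmull_s16 / vqrshrn_n_s32(x, 14) pairs that recur in the pass1/pass2 kernels above are the vector form of the Q14 cosine butterfly used by the scalar inverse DCT. A minimal scalar sketch of one such butterfly follows; it is a reading aid only, not part of this change, uses hypothetical names, and omits the narrowing saturation:

#include <stdint.h>

/* cos(pi/4) scaled by 2^14, the cospi_16_64 constant used above. */
#define COSPI_16_64 11585
#define DCT_CONST_BITS 14

/* Scalar analogue of vqrshrn_n_s32(x, 14): round to nearest, then drop the
 * 14 fractional bits (the saturating narrow is omitted in this sketch). */
static int16_t round_shift_q14(int32_t x) {
  return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* One cospi_16_64 butterfly as in stage 6: the NEON code forms a*c and b*c
 * with vmull_s16, then narrows their sum and difference. */
static void butterfly_cospi_16_64(int16_t a, int16_t b, int16_t *sum,
                                  int16_t *diff) {
  const int32_t ac = (int32_t)a * COSPI_16_64;
  const int32_t bc = (int32_t)b * COSPI_16_64;
  *sum = round_shift_q14(ac + bc);
  *diff = round_shift_q14(ac - bc);
}

The Q14 scaling keeps each 16x16-bit product inside 32 bits, so the whole butterfly fits a vmull_s16 plus vqrshrn_n_s32 sequence with no extra widening step.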
diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c
index 352979aa1..ecc263df2 100644
--- a/vpx_dsp/arm/idct16x16_neon.c
+++ b/vpx_dsp/arm/idct16x16_neon.c
@@ -10,24 +10,16 @@
#include "vpx_dsp/vpx_dsp_common.h"
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
- int16_t *output,
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
int output_stride);
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
- int16_t *output,
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
int output_stride);
-void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride);
#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
@@ -35,13 +27,13 @@ extern void vpx_push_neon(int64_t *store);
extern void vpx_pop_neon(int64_t *store);
#endif // HAVE_NEON_ASM
-void vpx_idct16x16_256_add_neon(const int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
- int16_t pass1_output[16*16] = {0};
- int16_t row_idct_output[16*16] = {0};
+ int16_t pass1_output[16 * 16] = { 0 };
+ int16_t row_idct_output[16 * 16] = { 0 };
#if HAVE_NEON_ASM
// save d8-d15 register values.
@@ -56,27 +48,19 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2(input+1,
- row_idct_output,
- pass1_output,
- 0,
- dest,
- dest_stride);
+ vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+ dest, dest_stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
- row_idct_output+8,
- pass1_output,
- 0,
- dest,
- dest_stride);
+ vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
+ pass1_output, 0, dest, dest_stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -86,27 +70,20 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
- row_idct_output,
- pass1_output,
- 1,
- dest,
- dest_stride);
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+ pass1_output, 1, dest, dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
- row_idct_output+8,
- pass1_output,
- 1,
- dest+8,
- dest_stride);
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+ row_idct_output + 8, pass1_output, 1,
+ dest + 8, dest_stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
@@ -116,13 +93,13 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
return;
}
-void vpx_idct16x16_10_add_neon(const int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
- int16_t pass1_output[16*16] = {0};
- int16_t row_idct_output[16*16] = {0};
+ int16_t pass1_output[16 * 16] = { 0 };
+ int16_t row_idct_output[16 * 16] = { 0 };
#if HAVE_NEON_ASM
// save d8-d15 register values.
@@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_10_add_neon_pass2(input+1,
- row_idct_output,
- pass1_output,
- 0,
- dest,
- dest_stride);
+ vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+ dest, dest_stride);
/* Skip Parallel idct on the lower 8 rows as they are all 0s */
@@ -154,27 +127,20 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
- row_idct_output,
- pass1_output,
- 1,
- dest,
- dest_stride);
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+ pass1_output, 1, dest, dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
- row_idct_output+8,
- pass1_output,
- 1,
- dest+8,
- dest_stride);
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+ row_idct_output + 8, pass1_output, 1,
+ dest + 8, dest_stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c
index c25c0c4a5..dab7d098e 100644
--- a/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -15,151 +15,126 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
-static INLINE void LD_16x8(
- uint8_t *d,
- int d_stride,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vld1q_u8(d);
- d += d_stride;
- *q9u8 = vld1q_u8(d);
- d += d_stride;
- *q10u8 = vld1q_u8(d);
- d += d_stride;
- *q11u8 = vld1q_u8(d);
- d += d_stride;
- *q12u8 = vld1q_u8(d);
- d += d_stride;
- *q13u8 = vld1q_u8(d);
- d += d_stride;
- *q14u8 = vld1q_u8(d);
- d += d_stride;
- *q15u8 = vld1q_u8(d);
- return;
+static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vld1q_u8(d);
+ d += d_stride;
+ *q9u8 = vld1q_u8(d);
+ d += d_stride;
+ *q10u8 = vld1q_u8(d);
+ d += d_stride;
+ *q11u8 = vld1q_u8(d);
+ d += d_stride;
+ *q12u8 = vld1q_u8(d);
+ d += d_stride;
+ *q13u8 = vld1q_u8(d);
+ d += d_stride;
+ *q14u8 = vld1q_u8(d);
+ d += d_stride;
+ *q15u8 = vld1q_u8(d);
+ return;
}
-static INLINE void ADD_DIFF_16x8(
- uint8x16_t qdiffu8,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
- *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
- *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
- *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
- *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
- *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
- *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
- *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
- return;
+static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+ return;
}
-static INLINE void SUB_DIFF_16x8(
- uint8x16_t qdiffu8,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
- *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
- *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
- *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
- *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
- *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
- *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
- *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
- return;
+static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+ return;
}
-static INLINE void ST_16x8(
- uint8_t *d,
- int d_stride,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- vst1q_u8(d, *q8u8);
- d += d_stride;
- vst1q_u8(d, *q9u8);
- d += d_stride;
- vst1q_u8(d, *q10u8);
- d += d_stride;
- vst1q_u8(d, *q11u8);
- d += d_stride;
- vst1q_u8(d, *q12u8);
- d += d_stride;
- vst1q_u8(d, *q13u8);
- d += d_stride;
- vst1q_u8(d, *q14u8);
- d += d_stride;
- vst1q_u8(d, *q15u8);
- return;
+static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ vst1q_u8(d, *q8u8);
+ d += d_stride;
+ vst1q_u8(d, *q9u8);
+ d += d_stride;
+ vst1q_u8(d, *q10u8);
+ d += d_stride;
+ vst1q_u8(d, *q11u8);
+ d += d_stride;
+ vst1q_u8(d, *q12u8);
+ d += d_stride;
+ vst1q_u8(d, *q13u8);
+ d += d_stride;
+ vst1q_u8(d, *q14u8);
+ d += d_stride;
+ vst1q_u8(d, *q15u8);
+ return;
}
-void vpx_idct32x32_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int i, j, dest_stride8;
- uint8_t *d;
- int16_t a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int i, j, dest_stride8;
+ uint8_t *d;
+ int16_t a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
- dest_stride8 = dest_stride * 8;
- if (a1 >= 0) { // diff_positive_32_32
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- d += dest_stride8;
- }
- }
- } else { // diff_negative_32_32
- a1 = -a1;
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- d += dest_stride8;
- }
- }
+ dest_stride8 = dest_stride * 8;
+ if (a1 >= 0) { // diff_positive_32_32
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ d += dest_stride8;
+ }
}
- return;
+ } else { // diff_negative_32_32
+ a1 = -a1;
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ }
+ return;
}
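In the DC-only path just above, a single rounded offset a1 is applied to the whole 32x32 block; LD_16x8, ADD_DIFF_16x8 (or SUB_DIFF_16x8), and ST_16x8 simply tile that saturating add over 16x8 chunks of the destination. A scalar sketch of the same computation, illustrative only, with a hypothetical name and assuming an arithmetic right shift as in the library's ROUND_POWER_OF_TWO:

#include <stdint.h>

/* Scalar sketch of the DC-only 32x32 path: input[0] is scaled twice by
 * cospi_16_64 in Q14, rounded by 6 bits into the pixel domain, and the
 * resulting offset a1 is added to every destination pixel with clipping to
 * 8 bits (the NEON code uses vqaddq_u8 / vqsubq_u8 on 16x8 tiles instead). */
static void idct32x32_dc_only_sketch(const int16_t *input, uint8_t *dest,
                                     int stride) {
  const int32_t cospi_16_64 = 11585;
  int32_t out = ((int32_t)input[0] * cospi_16_64 + (1 << 13)) >> 14;
  int32_t a1;
  int r, c;
  out = (out * cospi_16_64 + (1 << 13)) >> 14;
  a1 = (out + (1 << 5)) >> 6; /* ROUND_POWER_OF_TWO(out, 6) */
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      const int v = dest[c] + a1;
      dest[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    dest += stride;
  }
}

Working in 16x8 tiles lets the NEON version keep eight q registers of pixels (q8u8..q15u8) in flight per load/add/store round trip, which is why the loops step by dest_stride * 8 and split the 32 columns into two passes of 16.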
diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c
index 025437eb9..04f51bfdd 100644
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -14,706 +14,671 @@
#include "vpx_dsp/txfm_common.h"
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
- q14s16 = vld1q_s16(trans_buf + first * 8); \
- q13s16 = vld1q_s16(trans_buf + second * 8);
+ q14s16 = vld1q_s16(trans_buf + first * 8); \
+ q13s16 = vld1q_s16(trans_buf + second * 8);
#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
- qA = vld1q_s16(out + first * 32); \
- qB = vld1q_s16(out + second * 32);
+ qA = vld1q_s16(out + first * 32); \
+ qB = vld1q_s16(out + second * 32);
#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
- vst1q_s16(out + first * 32, qA); \
- vst1q_s16(out + second * 32, qB);
-
-#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
- __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
- q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(
- uint8_t *p1,
- uint8_t *p2,
- int stride,
- int16x8_t q6s16,
- int16x8_t q7s16,
- int16x8_t q8s16,
- int16x8_t q9s16) {
- int16x4_t d8s16, d9s16, d10s16, d11s16;
-
- d8s16 = vld1_s16((int16_t *)p1);
- p1 += stride;
- d11s16 = vld1_s16((int16_t *)p2);
- p2 -= stride;
- d9s16 = vld1_s16((int16_t *)p1);
- d10s16 = vld1_s16((int16_t *)p2);
-
- q7s16 = vrshrq_n_s16(q7s16, 6);
- q8s16 = vrshrq_n_s16(q8s16, 6);
- q9s16 = vrshrq_n_s16(q9s16, 6);
- q6s16 = vrshrq_n_s16(q6s16, 6);
-
- q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
- vreinterpret_u8_s16(d9s16)));
- q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_s16(d10s16)));
- q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_s16(d11s16)));
- q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
- vreinterpret_u8_s16(d8s16)));
-
- d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
- d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
- d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
- d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-
- vst1_s16((int16_t *)p1, d9s16);
- p1 -= stride;
- vst1_s16((int16_t *)p2, d10s16);
- p2 += stride;
- vst1_s16((int16_t *)p1, d8s16);
- vst1_s16((int16_t *)p2, d11s16);
- return;
+ vst1q_s16(out + first * 32, qA); \
+ vst1q_s16(out + second * 32, qB);
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
+ int stride, int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16) {
+ int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+ d8s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d11s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d9s16 = vld1_s16((int16_t *)p1);
+ d10s16 = vld1_s16((int16_t *)p2);
+
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+
+ q7s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
+ q8s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
+ q9s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
+ q6s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
+
+ d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+ d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+ d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+ vst1_s16((int16_t *)p1, d9s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d10s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p1, d8s16);
+ vst1_s16((int16_t *)p2, d11s16);
+ return;
}
-#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
- __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
- q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
- uint8_t *p1,
- uint8_t *p2,
- int stride,
- int16x8_t q4s16,
- int16x8_t q5s16,
- int16x8_t q6s16,
- int16x8_t q7s16) {
- int16x4_t d4s16, d5s16, d6s16, d7s16;
-
- d4s16 = vld1_s16((int16_t *)p1);
- p1 += stride;
- d7s16 = vld1_s16((int16_t *)p2);
- p2 -= stride;
- d5s16 = vld1_s16((int16_t *)p1);
- d6s16 = vld1_s16((int16_t *)p2);
-
- q5s16 = vrshrq_n_s16(q5s16, 6);
- q6s16 = vrshrq_n_s16(q6s16, 6);
- q7s16 = vrshrq_n_s16(q7s16, 6);
- q4s16 = vrshrq_n_s16(q4s16, 6);
-
- q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
- vreinterpret_u8_s16(d5s16)));
- q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
- vreinterpret_u8_s16(d6s16)));
- q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
- vreinterpret_u8_s16(d7s16)));
- q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
- vreinterpret_u8_s16(d4s16)));
-
- d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
- d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
- d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
- d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
-
- vst1_s16((int16_t *)p1, d5s16);
- p1 -= stride;
- vst1_s16((int16_t *)p2, d6s16);
- p2 += stride;
- vst1_s16((int16_t *)p2, d7s16);
- vst1_s16((int16_t *)p1, d4s16);
- return;
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
+ int stride, int16x8_t q4s16,
+ int16x8_t q5s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16) {
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+ d4s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d7s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d5s16 = vld1_s16((int16_t *)p1);
+ d6s16 = vld1_s16((int16_t *)p2);
+
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+
+ q5s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
+ q6s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
+ q7s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
+ q4s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
+
+ d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+ d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+ d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+ vst1_s16((int16_t *)p1, d5s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d6s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p2, d7s16);
+ vst1_s16((int16_t *)p1, d4s16);
+ return;
}
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
- DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(
- int16x8_t q14s16,
- int16x8_t q13s16,
- int16_t first_const,
- int16_t second_const,
- int16x8_t *qAs16,
- int16x8_t *qBs16) {
- int16x4_t d30s16, d31s16;
- int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
- int16x4_t dCs16, dDs16, dAs16, dBs16;
-
- dCs16 = vget_low_s16(q14s16);
- dDs16 = vget_high_s16(q14s16);
- dAs16 = vget_low_s16(q13s16);
- dBs16 = vget_high_s16(q13s16);
-
- d30s16 = vdup_n_s16(first_const);
- d31s16 = vdup_n_s16(second_const);
-
- q8s32 = vmull_s16(dCs16, d30s16);
- q10s32 = vmull_s16(dAs16, d31s16);
- q9s32 = vmull_s16(dDs16, d30s16);
- q11s32 = vmull_s16(dBs16, d31s16);
- q12s32 = vmull_s16(dCs16, d31s16);
-
- q8s32 = vsubq_s32(q8s32, q10s32);
- q9s32 = vsubq_s32(q9s32, q11s32);
-
- q10s32 = vmull_s16(dDs16, d31s16);
- q11s32 = vmull_s16(dAs16, d30s16);
- q15s32 = vmull_s16(dBs16, d30s16);
-
- q11s32 = vaddq_s32(q12s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q15s32);
-
- *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
- vqrshrn_n_s32(q9s32, 14));
- *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
- vqrshrn_n_s32(q10s32, 14));
- return;
+ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
+ int16_t first_const, int16_t second_const,
+ int16x8_t *qAs16, int16x8_t *qBs16) {
+ int16x4_t d30s16, d31s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+ int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+ dCs16 = vget_low_s16(q14s16);
+ dDs16 = vget_high_s16(q14s16);
+ dAs16 = vget_low_s16(q13s16);
+ dBs16 = vget_high_s16(q13s16);
+
+ d30s16 = vdup_n_s16(first_const);
+ d31s16 = vdup_n_s16(second_const);
+
+ q8s32 = vmull_s16(dCs16, d30s16);
+ q10s32 = vmull_s16(dAs16, d31s16);
+ q9s32 = vmull_s16(dDs16, d30s16);
+ q11s32 = vmull_s16(dBs16, d31s16);
+ q12s32 = vmull_s16(dCs16, d31s16);
+
+ q8s32 = vsubq_s32(q8s32, q10s32);
+ q9s32 = vsubq_s32(q9s32, q11s32);
+
+ q10s32 = vmull_s16(dDs16, d31s16);
+ q11s32 = vmull_s16(dAs16, d30s16);
+ q15s32 = vmull_s16(dBs16, d30s16);
+
+ q11s32 = vaddq_s32(q12s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q15s32);
+
+ *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
+ return;
}
-static INLINE void idct32_transpose_pair(
- int16_t *input,
- int16_t *t_buf) {
- int16_t *in;
- int i;
- const int stride = 32;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
- int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
- for (i = 0; i < 4; i++, input += 8) {
- in = input;
- q8s16 = vld1q_s16(in);
- in += stride;
- q9s16 = vld1q_s16(in);
- in += stride;
- q10s16 = vld1q_s16(in);
- in += stride;
- q11s16 = vld1q_s16(in);
- in += stride;
- q12s16 = vld1q_s16(in);
- in += stride;
- q13s16 = vld1q_s16(in);
- in += stride;
- q14s16 = vld1q_s16(in);
- in += stride;
- q15s16 = vld1q_s16(in);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
-
- q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
- q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
- q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
- q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
- q12s16 = vcombine_s16(d17s16, d25s16);
- q13s16 = vcombine_s16(d19s16, d27s16);
- q14s16 = vcombine_s16(d21s16, d29s16);
- q15s16 = vcombine_s16(d23s16, d31s16);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
- vreinterpretq_s32_s16(q10s16));
- q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
- vreinterpretq_s32_s16(q11s16));
- q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
- vreinterpretq_s32_s16(q14s16));
- q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
- vreinterpretq_s32_s16(q15s16));
-
- q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
- vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
- q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
- vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
- q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
- vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
- q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
- vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
-
- vst1q_s16(t_buf, q0x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q0x2s16.val[1]);
- t_buf += 8;
- vst1q_s16(t_buf, q1x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q1x2s16.val[1]);
- t_buf += 8;
- vst1q_s16(t_buf, q2x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q2x2s16.val[1]);
- t_buf += 8;
- vst1q_s16(t_buf, q3x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q3x2s16.val[1]);
- t_buf += 8;
- }
- return;
+static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
+ int16_t *in;
+ int i;
+ const int stride = 32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ in = input;
+ q8s16 = vld1q_s16(in);
+ in += stride;
+ q9s16 = vld1q_s16(in);
+ in += stride;
+ q10s16 = vld1q_s16(in);
+ in += stride;
+ q11s16 = vld1q_s16(in);
+ in += stride;
+ q12s16 = vld1q_s16(in);
+ in += stride;
+ q13s16 = vld1q_s16(in);
+ in += stride;
+ q14s16 = vld1q_s16(in);
+ in += stride;
+ q15s16 = vld1q_s16(in);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ q12s16 = vcombine_s16(d17s16, d25s16);
+ q13s16 = vcombine_s16(d19s16, d27s16);
+ q14s16 = vcombine_s16(d21s16, d29s16);
+ q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ vst1q_s16(t_buf, q0x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q0x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[1]);
+ t_buf += 8;
+ }
+ return;
}
-static INLINE void idct32_bands_end_1st_pass(
- int16_t *out,
- int16x8_t q2s16,
- int16x8_t q3s16,
- int16x8_t q6s16,
- int16x8_t q7s16,
- int16x8_t q8s16,
- int16x8_t q9s16,
- int16x8_t q10s16,
- int16x8_t q11s16,
- int16x8_t q12s16,
- int16x8_t q13s16,
- int16x8_t q14s16,
- int16x8_t q15s16) {
- int16x8_t q0s16, q1s16, q4s16, q5s16;
-
- STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
- STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
- STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
-
- LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
- q2s16 = vaddq_s16(q10s16, q1s16);
- q3s16 = vaddq_s16(q11s16, q0s16);
- q4s16 = vsubq_s16(q11s16, q0s16);
- q5s16 = vsubq_s16(q10s16, q1s16);
-
- LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
- STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
- STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
-
- LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
- q2s16 = vaddq_s16(q12s16, q1s16);
- q3s16 = vaddq_s16(q13s16, q0s16);
- q4s16 = vsubq_s16(q13s16, q0s16);
- q5s16 = vsubq_s16(q12s16, q1s16);
-
- LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
- STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
- STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
-
- LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
- q2s16 = vaddq_s16(q14s16, q1s16);
- q3s16 = vaddq_s16(q15s16, q0s16);
- q4s16 = vsubq_s16(q15s16, q0s16);
- q5s16 = vsubq_s16(q14s16, q1s16);
-
- LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
- STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
- STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
- return;
+static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
+ int16x8_t q3s16, int16x8_t q6s16,
+ int16x8_t q7s16, int16x8_t q8s16,
+ int16x8_t q9s16, int16x8_t q10s16,
+ int16x8_t q11s16, int16x8_t q12s16,
+ int16x8_t q13s16, int16x8_t q14s16,
+ int16x8_t q15s16) {
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+ STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+ STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+ STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+ STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+ STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+ STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+ STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+ STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+ return;
}
static INLINE void idct32_bands_end_2nd_pass(
- int16_t *out,
- uint8_t *dest,
- int stride,
- int16x8_t q2s16,
- int16x8_t q3s16,
- int16x8_t q6s16,
- int16x8_t q7s16,
- int16x8_t q8s16,
- int16x8_t q9s16,
- int16x8_t q10s16,
- int16x8_t q11s16,
- int16x8_t q12s16,
- int16x8_t q13s16,
- int16x8_t q14s16,
- int16x8_t q15s16) {
- uint8_t *r6 = dest + 31 * stride;
- uint8_t *r7 = dest/* + 0 * stride*/;
- uint8_t *r9 = dest + 15 * stride;
- uint8_t *r10 = dest + 16 * stride;
- int str2 = stride << 1;
- int16x8_t q0s16, q1s16, q4s16, q5s16;
-
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
- r10 += str2; r9 -= str2;
-
- LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- r7 += str2; r6 -= str2;
-
- LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
- q2s16 = vaddq_s16(q10s16, q1s16);
- q3s16 = vaddq_s16(q11s16, q0s16);
- q4s16 = vsubq_s16(q11s16, q0s16);
- q5s16 = vsubq_s16(q10s16, q1s16);
-
- LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
- r10 += str2; r9 -= str2;
-
- LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- r7 += str2; r6 -= str2;
-
- LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
- q2s16 = vaddq_s16(q12s16, q1s16);
- q3s16 = vaddq_s16(q13s16, q0s16);
- q4s16 = vsubq_s16(q13s16, q0s16);
- q5s16 = vsubq_s16(q12s16, q1s16);
-
- LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
- r10 += str2; r9 -= str2;
-
- LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- r7 += str2; r6 -= str2;
-
- LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
- q2s16 = vaddq_s16(q14s16, q1s16);
- q3s16 = vaddq_s16(q15s16, q0s16);
- q4s16 = vsubq_s16(q15s16, q0s16);
- q5s16 = vsubq_s16(q14s16, q1s16);
-
- LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
-
- LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- return;
+ int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
+ int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
+ int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
+ int16x8_t q14s16, int16x8_t q15s16) {
+ uint8_t *r6 = dest + 31 * stride;
+ uint8_t *r7 = dest /* + 0 * stride*/;
+ uint8_t *r9 = dest + 15 * stride;
+ uint8_t *r10 = dest + 16 * stride;
+ int str2 = stride << 1;
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2;
+ r9 -= str2;
+
+ LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2;
+ r6 -= str2;
+
+ LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2;
+ r9 -= str2;
+
+ LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2;
+ r6 -= str2;
+
+ LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2;
+ r9 -= str2;
+
+ LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2;
+ r6 -= str2;
+
+ LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+ LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ return;
}
-void vpx_idct32x32_1024_add_neon(
- int16_t *input,
- uint8_t *dest,
- int stride) {
- int i, idct32_pass_loop;
- int16_t trans_buf[32 * 8];
- int16_t pass1[32 * 32];
- int16_t pass2[32 * 32];
- int16_t *out;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-
- for (idct32_pass_loop = 0, out = pass1;
- idct32_pass_loop < 2;
- idct32_pass_loop++,
- input = pass1, // the input of pass2 is the result of pass1
- out = pass2) {
- for (i = 0;
- i < 4; i++,
- input += 32 * 8, out += 8) { // idct32_bands_loop
- idct32_transpose_pair(input, trans_buf);
-
- // -----------------------------------------
- // BLOCK A: 16-19,28-31
- // -----------------------------------------
- // generate 16,17,30,31
- // part of stage 1
- LOAD_FROM_TRANSPOSED(0, 1, 31)
- DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(31, 17, 15)
- DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
- // part of stage 2
- q4s16 = vaddq_s16(q0s16, q1s16);
- q13s16 = vsubq_s16(q0s16, q1s16);
- q6s16 = vaddq_s16(q2s16, q3s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- // part of stage 3
- DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
-
- // generate 18,19,28,29
- // part of stage 1
- LOAD_FROM_TRANSPOSED(15, 9, 23)
- DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(23, 25, 7)
- DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
- // part of stage 2
- q13s16 = vsubq_s16(q3s16, q2s16);
- q3s16 = vaddq_s16(q3s16, q2s16);
- q14s16 = vsubq_s16(q1s16, q0s16);
- q2s16 = vaddq_s16(q1s16, q0s16);
- // part of stage 3
- DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
- // part of stage 4
- q8s16 = vaddq_s16(q4s16, q2s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q10s16 = vaddq_s16(q7s16, q1s16);
- q15s16 = vaddq_s16(q6s16, q3s16);
- q13s16 = vsubq_s16(q5s16, q0s16);
- q14s16 = vsubq_s16(q7s16, q1s16);
- STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
- STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
- // part of stage 5
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
- STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
- // part of stage 4
- q13s16 = vsubq_s16(q4s16, q2s16);
- q14s16 = vsubq_s16(q6s16, q3s16);
- // part of stage 5
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
- STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
-
- // -----------------------------------------
- // BLOCK B: 20-23,24-27
- // -----------------------------------------
- // generate 20,21,26,27
- // part of stage 1
- LOAD_FROM_TRANSPOSED(7, 5, 27)
- DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(27, 21, 11)
- DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
- // part of stage 2
- q13s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- q2s16 = vaddq_s16(q2s16, q3s16);
- // part of stage 3
- DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-
- // generate 22,23,24,25
- // part of stage 1
- LOAD_FROM_TRANSPOSED(11, 13, 19)
- DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
- LOAD_FROM_TRANSPOSED(19, 29, 3)
- DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
- // part of stage 2
- q14s16 = vsubq_s16(q4s16, q5s16);
- q5s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q6s16, q7s16);
- q6s16 = vaddq_s16(q6s16, q7s16);
- // part of stage 3
- DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
- // part of stage 4
- q10s16 = vaddq_s16(q7s16, q1s16);
- q11s16 = vaddq_s16(q5s16, q0s16);
- q12s16 = vaddq_s16(q6s16, q2s16);
- q15s16 = vaddq_s16(q4s16, q3s16);
- // part of stage 6
- LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
- q8s16 = vaddq_s16(q14s16, q11s16);
- q9s16 = vaddq_s16(q13s16, q10s16);
- q13s16 = vsubq_s16(q13s16, q10s16);
- q11s16 = vsubq_s16(q14s16, q11s16);
- STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
- LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
- q8s16 = vsubq_s16(q9s16, q12s16);
- q10s16 = vaddq_s16(q14s16, q15s16);
- q14s16 = vsubq_s16(q14s16, q15s16);
- q12s16 = vaddq_s16(q9s16, q12s16);
- STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
- // part of stage 7
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
- STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
- q13s16 = q11s16;
- q14s16 = q8s16;
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
- STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
- // part of stage 4
- q14s16 = vsubq_s16(q5s16, q0s16);
- q13s16 = vsubq_s16(q6s16, q2s16);
- DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
- q14s16 = vsubq_s16(q7s16, q1s16);
- q13s16 = vsubq_s16(q4s16, q3s16);
- DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
- // part of stage 6
- LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
- q8s16 = vaddq_s16(q14s16, q1s16);
- q9s16 = vaddq_s16(q13s16, q6s16);
- q13s16 = vsubq_s16(q13s16, q6s16);
- q1s16 = vsubq_s16(q14s16, q1s16);
- STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
- LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
- q14s16 = vsubq_s16(q8s16, q5s16);
- q10s16 = vaddq_s16(q8s16, q5s16);
- q11s16 = vaddq_s16(q9s16, q0s16);
- q0s16 = vsubq_s16(q9s16, q0s16);
- STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
- // part of stage 7
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
- STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
- DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
- &q1s16, &q0s16);
- STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
-
- // -----------------------------------------
-      // BLOCK C: 8-11,12-15
- // -----------------------------------------
- // generate 8,9,14,15
- // part of stage 2
- LOAD_FROM_TRANSPOSED(3, 2, 30)
- DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(30, 18, 14)
- DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
- // part of stage 3
- q13s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- q2s16 = vaddq_s16(q2s16, q3s16);
- // part of stage 4
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
-
- // generate 10,11,12,13
- // part of stage 2
- LOAD_FROM_TRANSPOSED(14, 10, 22)
- DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
- LOAD_FROM_TRANSPOSED(22, 26, 6)
- DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
- // part of stage 3
- q14s16 = vsubq_s16(q4s16, q5s16);
- q5s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q6s16, q7s16);
- q6s16 = vaddq_s16(q6s16, q7s16);
- // part of stage 4
- DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
- // part of stage 5
- q8s16 = vaddq_s16(q0s16, q5s16);
- q9s16 = vaddq_s16(q1s16, q7s16);
- q13s16 = vsubq_s16(q1s16, q7s16);
- q14s16 = vsubq_s16(q3s16, q4s16);
- q10s16 = vaddq_s16(q3s16, q4s16);
- q15s16 = vaddq_s16(q2s16, q6s16);
- STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
- STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
- // part of stage 6
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
- STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
- q13s16 = vsubq_s16(q0s16, q5s16);
- q14s16 = vsubq_s16(q2s16, q6s16);
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
- STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
-
- // -----------------------------------------
- // BLOCK D: 0-3,4-7
- // -----------------------------------------
- // generate 4,5,6,7
- // part of stage 3
- LOAD_FROM_TRANSPOSED(6, 4, 28)
- DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(28, 20, 12)
- DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
- // part of stage 4
- q13s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- q2s16 = vaddq_s16(q2s16, q3s16);
- // part of stage 5
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-
- // generate 0,1,2,3
- // part of stage 4
- LOAD_FROM_TRANSPOSED(12, 0, 16)
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
- LOAD_FROM_TRANSPOSED(16, 8, 24)
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
- // part of stage 5
- q4s16 = vaddq_s16(q7s16, q6s16);
- q7s16 = vsubq_s16(q7s16, q6s16);
- q6s16 = vsubq_s16(q5s16, q14s16);
- q5s16 = vaddq_s16(q5s16, q14s16);
- // part of stage 6
- q8s16 = vaddq_s16(q4s16, q2s16);
- q9s16 = vaddq_s16(q5s16, q3s16);
- q10s16 = vaddq_s16(q6s16, q1s16);
- q11s16 = vaddq_s16(q7s16, q0s16);
- q12s16 = vsubq_s16(q7s16, q0s16);
- q13s16 = vsubq_s16(q6s16, q1s16);
- q14s16 = vsubq_s16(q5s16, q3s16);
- q15s16 = vsubq_s16(q4s16, q2s16);
- // part of stage 7
- LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
- q2s16 = vaddq_s16(q8s16, q1s16);
- q3s16 = vaddq_s16(q9s16, q0s16);
- q4s16 = vsubq_s16(q9s16, q0s16);
- q5s16 = vsubq_s16(q8s16, q1s16);
- LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
-
- if (idct32_pass_loop == 0) {
- idct32_bands_end_1st_pass(out,
- q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
- q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
- } else {
- idct32_bands_end_2nd_pass(out, dest, stride,
- q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
- q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
- dest += 8;
- }
- }
+void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ int16_t *out;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+ for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+ idct32_pass_loop++,
+ input = pass1, // the input of pass2 is the result of pass1
+ out = pass2) {
+ for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(0, 1, 31)
+ DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(31, 17, 15)
+ DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+ // part of stage 2
+ q4s16 = vaddq_s16(q0s16, q1s16);
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q6s16 = vaddq_s16(q2s16, q3s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+ // generate 18,19,28,29
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(15, 9, 23)
+ DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(23, 25, 7)
+ DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q3s16, q2s16);
+ q3s16 = vaddq_s16(q3s16, q2s16);
+ q14s16 = vsubq_s16(q1s16, q0s16);
+ q2s16 = vaddq_s16(q1s16, q0s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+ // part of stage 4
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q15s16 = vaddq_s16(q6s16, q3s16);
+ q13s16 = vsubq_s16(q5s16, q0s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+ STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+ STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q4s16, q2s16);
+ q14s16 = vsubq_s16(q6s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+ STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(7, 5, 27)
+ DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(27, 21, 11)
+ DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+ // generate 22,23,24,25
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(11, 13, 19)
+ DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(19, 29, 3)
+ DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+ // part of stage 2
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+ // part of stage 4
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q11s16 = vaddq_s16(q5s16, q0s16);
+ q12s16 = vaddq_s16(q6s16, q2s16);
+ q15s16 = vaddq_s16(q4s16, q3s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q11s16);
+ q9s16 = vaddq_s16(q13s16, q10s16);
+ q13s16 = vsubq_s16(q13s16, q10s16);
+ q11s16 = vsubq_s16(q14s16, q11s16);
+ STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+ LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+ q8s16 = vsubq_s16(q9s16, q12s16);
+ q10s16 = vaddq_s16(q14s16, q15s16);
+ q14s16 = vsubq_s16(q14s16, q15s16);
+ q12s16 = vaddq_s16(q9s16, q12s16);
+ STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+ q13s16 = q11s16;
+ q14s16 = q8s16;
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+ // part of stage 4
+ q14s16 = vsubq_s16(q5s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q2s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ q13s16 = vsubq_s16(q4s16, q3s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q1s16);
+ q9s16 = vaddq_s16(q13s16, q6s16);
+ q13s16 = vsubq_s16(q13s16, q6s16);
+ q1s16 = vsubq_s16(q14s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+ LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+ q14s16 = vsubq_s16(q8s16, q5s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q9s16, q0s16);
+ q0s16 = vsubq_s16(q9s16, q0s16);
+ STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+ DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
+ STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+ // -----------------------------------------
+      // BLOCK C: 8-11,12-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(3, 2, 30)
+ DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(30, 18, 14)
+ DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+ // part of stage 3
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+ // generate 10,11,12,13
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(14, 10, 22)
+ DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(22, 26, 6)
+ DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+ // part of stage 3
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+ // part of stage 5
+ q8s16 = vaddq_s16(q0s16, q5s16);
+ q9s16 = vaddq_s16(q1s16, q7s16);
+ q13s16 = vsubq_s16(q1s16, q7s16);
+ q14s16 = vsubq_s16(q3s16, q4s16);
+ q10s16 = vaddq_s16(q3s16, q4s16);
+ q15s16 = vaddq_s16(q2s16, q6s16);
+ STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+ STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+ // part of stage 6
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+ q13s16 = vsubq_s16(q0s16, q5s16);
+ q14s16 = vsubq_s16(q2s16, q6s16);
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ LOAD_FROM_TRANSPOSED(6, 4, 28)
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(28, 20, 12)
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+ // generate 0,1,2,3
+ // part of stage 4
+ LOAD_FROM_TRANSPOSED(12, 0, 16)
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(16, 8, 24)
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+ // part of stage 5
+ q4s16 = vaddq_s16(q7s16, q6s16);
+ q7s16 = vsubq_s16(q7s16, q6s16);
+ q6s16 = vsubq_s16(q5s16, q14s16);
+ q5s16 = vaddq_s16(q5s16, q14s16);
+ // part of stage 6
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q3s16);
+ q10s16 = vaddq_s16(q6s16, q1s16);
+ q11s16 = vaddq_s16(q7s16, q0s16);
+ q12s16 = vsubq_s16(q7s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q1s16);
+ q14s16 = vsubq_s16(q5s16, q3s16);
+ q15s16 = vsubq_s16(q4s16, q2s16);
+ // part of stage 7
+ LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+ q2s16 = vaddq_s16(q8s16, q1s16);
+ q3s16 = vaddq_s16(q9s16, q0s16);
+ q4s16 = vsubq_s16(q9s16, q0s16);
+ q5s16 = vsubq_s16(q8s16, q1s16);
+ LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16,
+ q15s16);
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
+ q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
+ q14s16, q15s16);
+ dest += 8;
+ }
}
- return;
+ }
+ return;
}
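
The DO_BUTTERFLY helper above is the fixed-point rotation shared by all of these inverse transforms: two multiply pairs followed by the rounding, saturating narrow by 14 bits that vqrshrn_n_s32(., 14) provides. A minimal scalar sketch of one lane, assuming the 14-bit cospi_*_64 constants from vpx_dsp/txfm_common.h (round_shift_14 and butterfly_lane are illustrative names, not library functions):

  #include <stdint.h>

  /* Rounding shift right by 14 with saturation to int16, mirroring
   * vqrshrn_n_s32(., 14). */
  static int16_t round_shift_14(int32_t v) {
    v = (v + (1 << 13)) >> 14;
    if (v > INT16_MAX) v = INT16_MAX;
    if (v < INT16_MIN) v = INT16_MIN;
    return (int16_t)v;
  }

  /* One lane of DO_BUTTERFLY: a and b are the q14/q13 inputs, c1/c2 the two
   * constants; qA receives a*c1 - b*c2 and qB receives a*c2 + b*c1, each
   * rounded back to 16 bits. */
  static void butterfly_lane(int16_t a, int16_t b, int16_t c1, int16_t c2,
                             int16_t *qA, int16_t *qB) {
    *qA = round_shift_14(a * c1 - b * c2);
    *qB = round_shift_14(a * c2 + b * c1);
  }

In vpx_idct32x32_1024_add_neon this rotation is applied in two passes: the first pass transforms the transposed input into pass1, and the second pass reads pass1 and adds its rounded result into dest through the STORE_COMBINE_* macros.
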
diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c
index ea618700c..9f999e979 100644
--- a/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -13,38 +13,34 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
-void vpx_idct4x4_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d6u8;
- uint32x2_t d2u32 = vdup_n_u32(0);
- uint16x8_t q8u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 4);
+void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d6u8;
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint16x8_t q8u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
- q0s16 = vdupq_n_s16(a1);
+ q0s16 = vdupq_n_s16(a1);
- // dc_only_idct_add
- d1 = d2 = dest;
- for (i = 0; i < 2; i++) {
- d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
- d1 += dest_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
- d1 += dest_stride;
+ // dc_only_idct_add
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+ d1 += dest_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+ d1 += dest_stride;
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
- vreinterpret_u8_u32(d2u32));
- d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
+ d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
- d2 += dest_stride;
- vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
- d2 += dest_stride;
- }
- return;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+ d2 += dest_stride;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+ d2 += dest_stride;
+ }
+ return;
}
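
The DC-only 4x4 path above reduces to computing one constant and adding it to all sixteen pixels. A scalar sketch of the same arithmetic, with 11585 standing in for cospi_16_64 and a local clamp in place of the saturating vqmovun_s16 (idct4x4_1_add_ref and clamp_u8 are illustrative names):

  #include <stdint.h>

  static uint8_t clamp_u8(int v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  static void idct4x4_1_add_ref(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
    int r, c, a1;
    /* dct_const_round_shift(x): (x + (1 << 13)) >> 14 */
    int out = ((int)input[0] * 11585 + (1 << 13)) >> 14;
    out = (out * 11585 + (1 << 13)) >> 14;
    a1 = (out + 8) >> 4; /* ROUND_POWER_OF_TWO(out, 4) */
    for (r = 0; r < 4; ++r, dest += dest_stride)
      for (c = 0; c < 4; ++c) dest[c] = clamp_u8(dest[c] + a1);
  }
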
diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c
index 3c975c99b..382626928 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -10,142 +10,137 @@
#include <arm_neon.h>
-void vpx_idct4x4_16_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d26u8, d27u8;
- uint32x2_t d26u32, d27u32;
- uint16x8_t q8u16, q9u16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
- int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
- int16x8_t q8s16, q9s16, q13s16, q14s16;
- int32x4_t q1s32, q13s32, q14s32, q15s32;
- int16x4x2_t d0x2s16, d1x2s16;
- int32x4x2_t q0x2s32;
- uint8_t *d;
- int16_t cospi_8_64 = 15137;
- int16_t cospi_16_64 = 11585;
- int16_t cospi_24_64 = 6270;
-
- d26u32 = d27u32 = vdup_n_u32(0);
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
-
- d0x2s16 = vtrn_s16(d16s16, d17s16);
- d1x2s16 = vtrn_s16(d18s16, d19s16);
- q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
- q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
- d20s16 = vdup_n_s16(cospi_8_64);
- d21s16 = vdup_n_s16(cospi_16_64);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
- vreinterpretq_s32_s16(q9s16));
- d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
- d22s16 = vdup_n_s16(cospi_24_64);
-
- // stage 1
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, d22s16);
- q1s32 = vmull_s16(d17s16, d20s16);
- q13s32 = vmull_s16(d23s16, d21s16);
- q14s32 = vmull_s16(d24s16, d21s16);
-
- q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
- q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
-
- // stage 2
- q8s16 = vaddq_s16(q13s16, q14s16);
- q9s16 = vsubq_s16(q13s16, q14s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_high_s16(q9s16); // vswp d18 d19
- d19s16 = vget_low_s16(q9s16);
-
- d0x2s16 = vtrn_s16(d16s16, d17s16);
- d1x2s16 = vtrn_s16(d18s16, d19s16);
- q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
- q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
- vreinterpretq_s32_s16(q9s16));
- d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
- // do the transform on columns
- // stage 1
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, d22s16);
- q1s32 = vmull_s16(d17s16, d20s16);
- q13s32 = vmull_s16(d23s16, d21s16);
- q14s32 = vmull_s16(d24s16, d21s16);
-
- q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
- q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
-
- // stage 2
- q8s16 = vaddq_s16(q13s16, q14s16);
- q9s16 = vsubq_s16(q13s16, q14s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 4);
- q9s16 = vrshrq_n_s16(q9s16, 4);
-
- d = dest;
- d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
- d += dest_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
- d += dest_stride;
- d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
- d += dest_stride;
- d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u32(d26u32));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u32(d27u32));
-
- d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
- d = dest;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
- return;
+void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d26u8, d27u8;
+ uint32x2_t d26u32, d27u32;
+ uint16x8_t q8u16, q9u16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+ int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+ int16x8_t q8s16, q9s16, q13s16, q14s16;
+ int32x4_t q1s32, q13s32, q14s32, q15s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+ uint8_t *d;
+ int16_t cospi_8_64 = 15137;
+ int16_t cospi_16_64 = 11585;
+ int16_t cospi_24_64 = 6270;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ d20s16 = vdup_n_s16(cospi_8_64);
+ d21s16 = vdup_n_s16(cospi_16_64);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ d22s16 = vdup_n_s16(cospi_24_64);
+
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_high_s16(q9s16); // vswp d18 d19
+ d19s16 = vget_low_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ // do the transform on columns
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d = dest;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+ d += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ d = dest;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ return;
}
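
The full 4x4 path above runs one 4-point inverse transform across rows, transposes, runs it again across columns, then rounds by 4 (vrshrq_n_s16(., 4)) and adds the result to dest. In scalar form, a single 4-point transform amounts to the following sketch, using the constants declared in the function (cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270); rs14 is an illustrative stand-in for the rounding shift performed by vqrshrn_n_s32(., 14), with saturation omitted for brevity:

  #include <stdint.h>

  static int16_t rs14(int32_t v) { return (int16_t)((v + (1 << 13)) >> 14); }

  static void idct4_ref(const int16_t in[4], int16_t out[4]) {
    int16_t step[4];
    /* stage 1: two rotations */
    step[0] = rs14((in[0] + in[2]) * 11585);      /* cospi_16_64 */
    step[1] = rs14((in[0] - in[2]) * 11585);
    step[2] = rs14(in[1] * 6270 - in[3] * 15137); /* cospi_24_64 / cospi_8_64 */
    step[3] = rs14(in[1] * 15137 + in[3] * 6270);
    /* stage 2: butterflies */
    out[0] = step[0] + step[3];
    out[1] = step[1] + step[2];
    out[2] = step[1] - step[2];
    out[3] = step[0] - step[3];
  }
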
diff --git a/vpx_dsp/arm/idct8x8_1_add_neon.c b/vpx_dsp/arm/idct8x8_1_add_neon.c
index c1b801fad..e3db0b876 100644
--- a/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -13,52 +13,49 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
-void vpx_idct8x8_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 5);
-
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
-
- d1 = d2 = dest;
- for (i = 0; i < 2; i++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d5u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
-
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
- }
- return;
+void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
}
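
Both idct32_transpose_pair above and the TRANSPOSE8X8 helper in the next file build an 8x8 in-register transpose out of three swap granularities: 64-bit halves are exchanged by recombining swapped d registers (the "// vswp" comments), 32-bit pairs by vtrnq_s32, and 16-bit elements by vtrnq_s16. The combined effect is an ordinary 8x8 transpose; a scalar sketch for comparison (transpose_8x8_ref is an illustrative name):

  #include <stdint.h>

  /* Plain 8x8 transpose of int16 values: out[c][r] = in[r][c]. */
  static void transpose_8x8_ref(const int16_t *in, int in_stride, int16_t *out,
                                int out_stride) {
    int r, c;
    for (r = 0; r < 8; ++r)
      for (c = 0; c < 8; ++c) out[c * out_stride + r] = in[r * in_stride + c];
  }
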
diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c
index 4b2c2a6f8..f1c271110 100644
--- a/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_add_neon.c
@@ -13,528 +13,496 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
-static INLINE void TRANSPOSE8X8(
- int16x8_t *q8s16,
- int16x8_t *q9s16,
- int16x8_t *q10s16,
- int16x8_t *q11s16,
- int16x8_t *q12s16,
- int16x8_t *q13s16,
- int16x8_t *q14s16,
- int16x8_t *q15s16) {
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
- int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
- *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
- *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
- *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
- *q12s16 = vcombine_s16(d17s16, d25s16);
- *q13s16 = vcombine_s16(d19s16, d27s16);
- *q14s16 = vcombine_s16(d21s16, d29s16);
- *q15s16 = vcombine_s16(d23s16, d31s16);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
- vreinterpretq_s32_s16(*q10s16));
- q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
- vreinterpretq_s32_s16(*q11s16));
- q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
- vreinterpretq_s32_s16(*q14s16));
- q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
- vreinterpretq_s32_s16(*q15s16));
-
- q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
- vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
- q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
- vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
- q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
- vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
- q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
- vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
-
- *q8s16 = q0x2s16.val[0];
- *q9s16 = q0x2s16.val[1];
- *q10s16 = q1x2s16.val[0];
- *q11s16 = q1x2s16.val[1];
- *q12s16 = q2x2s16.val[0];
- *q13s16 = q2x2s16.val[1];
- *q14s16 = q3x2s16.val[0];
- *q15s16 = q3x2s16.val[1];
- return;
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
}
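
TRANSPOSE8X8 above transposes eight int16x8_t rows entirely in registers: the vcombine step swaps the high d-halves between rows i and i+4 (a 4x4 block swap), and the vtrnq_s32/vtrnq_s16 pairs finish the job with 2x2 interleaves at 32- and 16-bit granularity. The end result is an ordinary 8x8 transpose; a plain-C reference of what the register shuffle is expected to compute, for comparison (a hypothetical helper, not part of libvpx):

/* Reference 8x8 transpose of int16_t values. */
static void transpose_8x8_ref(const int16_t in[8][8], int16_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) out[c][r] = in[r][c];
}
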
-static INLINE void IDCT8x8_1D(
- int16x8_t *q8s16,
- int16x8_t *q9s16,
- int16x8_t *q10s16,
- int16x8_t *q11s16,
- int16x8_t *q12s16,
- int16x8_t *q13s16,
- int16x8_t *q14s16,
- int16x8_t *q15s16) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d26s16, d2s16);
- q6s32 = vmull_s16(d27s16, d2s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
- q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d10s16 = vqrshrn_n_s32(q5s32, 14);
- d11s16 = vqrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q2s32 = vmull_s16(d18s16, d1s16);
- q3s32 = vmull_s16(d19s16, d1s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q13s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
- d14s16 = vqrshrn_n_s32(q2s32, 14);
- d15s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q13s32, 14);
- q6s16 = vcombine_s16(d12s16, d13s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d0s16 = vdup_n_s16(cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d0s16);
- q3s32 = vmull_s16(d17s16, d0s16);
- q13s32 = vmull_s16(d16s16, d0s16);
- q15s32 = vmull_s16(d17s16, d0s16);
-
- q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
- q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
- q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
- d0s16 = vdup_n_s16(cospi_24_64);
- d1s16 = vdup_n_s16(cospi_8_64);
-
- d18s16 = vqrshrn_n_s32(q2s32, 14);
- d19s16 = vqrshrn_n_s32(q3s32, 14);
- d22s16 = vqrshrn_n_s32(q13s32, 14);
- d23s16 = vqrshrn_n_s32(q15s32, 14);
- *q9s16 = vcombine_s16(d18s16, d19s16);
- *q11s16 = vcombine_s16(d22s16, d23s16);
-
- q2s32 = vmull_s16(d20s16, d0s16);
- q3s32 = vmull_s16(d21s16, d0s16);
- q8s32 = vmull_s16(d20s16, d1s16);
- q12s32 = vmull_s16(d21s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
- q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
- q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
- d26s16 = vqrshrn_n_s32(q2s32, 14);
- d27s16 = vqrshrn_n_s32(q3s32, 14);
- d30s16 = vqrshrn_n_s32(q8s32, 14);
- d31s16 = vqrshrn_n_s32(q12s32, 14);
- *q13s16 = vcombine_s16(d26s16, d27s16);
- *q15s16 = vcombine_s16(d30s16, d31s16);
-
- q0s16 = vaddq_s16(*q9s16, *q15s16);
- q1s16 = vaddq_s16(*q11s16, *q13s16);
- q2s16 = vsubq_s16(*q11s16, *q13s16);
- q3s16 = vsubq_s16(*q9s16, *q15s16);
-
- *q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- *q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
-
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- *q8s16 = vaddq_s16(q0s16, q7s16);
- *q9s16 = vaddq_s16(q1s16, q6s16);
- *q10s16 = vaddq_s16(q2s16, q5s16);
- *q11s16 = vaddq_s16(q3s16, q4s16);
- *q12s16 = vsubq_s16(q3s16, q4s16);
- *q13s16 = vsubq_s16(q2s16, q5s16);
- *q14s16 = vsubq_s16(q1s16, q6s16);
- *q15s16 = vsubq_s16(q0s16, q7s16);
- return;
+static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
}
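
Every multiply in IDCT8x8_1D is followed by vqrshrn_n_s32(x, 14), the NEON form of the fixed-point rounding used across vpx_dsp: the cospi constants are scaled by 2^14 (DCT_CONST_BITS), so each 32-bit product is rounded back with (x + (1 << 13)) >> 14 and saturated to 16 bits. A scalar sketch of that step (illustrative only, not the library helper):

#include <stdint.h>

/* Round, shift by DCT_CONST_BITS (14) and saturate to int16_t,
 * matching vqrshrn_n_s32(x, 14). */
static int16_t round_shift_14(int32_t x) {
  const int32_t r = (x + (1 << 13)) >> 14;
  if (r > INT16_MAX) return INT16_MAX;
  if (r < INT16_MIN) return INT16_MIN;
  return (int16_t)r;
}
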
-void vpx_idct8x8_64_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- d1 = d2 = dest;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
-
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- return;
+void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
}
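
The tail of vpx_idct8x8_64_add_neon rounds each residual by 5 bits (vrshrq_n_s16), widens a row of 8 destination bytes, adds, and narrows back with unsigned saturation (vqmovun_s16) before storing. Per pixel that reduces to the following sketch, assuming clip_pixel() and ROUND_POWER_OF_TWO() from vpx_dsp:

#include "vpx_dsp/vpx_dsp_common.h"

/* Scalar form of the widen-add-saturate store done four rows at a time. */
static void add_residual_row_sketch(const int16_t *residual, uint8_t *dest) {
  int c;
  for (c = 0; c < 8; ++c)
    dest[c] = clip_pixel(dest[c] + ROUND_POWER_OF_TWO(residual[c], 5));
}
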
-void vpx_idct8x8_12_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
- int16x4_t d26s16, d27s16, d28s16, d29s16;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
- int32x4_t q9s32, q10s32, q11s32, q12s32;
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- // First transform rows
- // stage 1
- q0s16 = vdupq_n_s16(cospi_28_64 * 2);
- q1s16 = vdupq_n_s16(cospi_4_64 * 2);
-
- q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-
- q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
-
- q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
- q1s16 = vdupq_n_s16(cospi_12_64 * 2);
-
- q5s16 = vqrdmulhq_s16(q11s16, q0s16);
-
- q0s16 = vdupq_n_s16(cospi_16_64 * 2);
-
- q6s16 = vqrdmulhq_s16(q11s16, q1s16);
-
- // stage 2 & stage 3 - even half
- q1s16 = vdupq_n_s16(cospi_24_64 * 2);
-
- q9s16 = vqrdmulhq_s16(q8s16, q0s16);
-
- q0s16 = vdupq_n_s16(cospi_8_64 * 2);
-
- q13s16 = vqrdmulhq_s16(q10s16, q1s16);
-
- q15s16 = vqrdmulhq_s16(q10s16, q0s16);
-
- // stage 3 -odd half
- q0s16 = vaddq_s16(q9s16, q15s16);
- q1s16 = vaddq_s16(q9s16, q13s16);
- q2s16 = vsubq_s16(q9s16, q13s16);
- q3s16 = vsubq_s16(q9s16, q15s16);
-
- // stage 2 - odd half
- q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 4
- q8s16 = vaddq_s16(q0s16, q7s16);
- q9s16 = vaddq_s16(q1s16, q6s16);
- q10s16 = vaddq_s16(q2s16, q5s16);
- q11s16 = vaddq_s16(q3s16, q4s16);
- q12s16 = vsubq_s16(q3s16, q4s16);
- q13s16 = vsubq_s16(q2s16, q5s16);
- q14s16 = vsubq_s16(q1s16, q6s16);
- q15s16 = vsubq_s16(q0s16, q7s16);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- d1 = d2 = dest;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
-
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- return;
+void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // First transform rows
+ // stage 1
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+ q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+ q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+ q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+ // stage 2 & stage 3 - even half
+ q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+ q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+ q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+ q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+ // stage 3 -odd half
+ q0s16 = vaddq_s16(q9s16, q15s16);
+ q1s16 = vaddq_s16(q9s16, q13s16);
+ q2s16 = vsubq_s16(q9s16, q13s16);
+ q3s16 = vsubq_s16(q9s16, q15s16);
+
+ // stage 2 - odd half
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ q8s16 = vaddq_s16(q0s16, q7s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q7s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
}
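
vpx_idct8x8_12_add_neon above assumes only the upper-left 4x4 coefficients are nonzero, so its first pass replaces the vmull/vqrshrn pairs with vqrdmulhq_s16 against constants that are pre-doubled (cospi_x_64 * 2). The arithmetic works out to the same rounding while never leaving 16-bit lanes; a sketch of why, using the architectural definition of VQRDMULH on 16-bit elements:

/* Why doubled constants + VQRDMULH match the full-precision path:
 *
 *   vqrdmulh(a, b)   = sat16((2 * a * b + (1 << 15)) >> 16)
 *   vqrdmulh(a, 2*c) = sat16((4 * a * c + (1 << 15)) >> 16)
 *                    = sat16((a * c + (1 << 13)) >> 14)
 *
 * i.e. the same rounding as vqrshrn_n_s32(vmull_s16(a, c), 14) used in
 * IDCT8x8_1D, provided 2*c still fits in int16_t (it does for the cospi
 * constants used in this pass). Sketch only, not library code. */
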
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 0a376104d..32dd1ba14 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -18,9 +18,8 @@
// DC 4x4
// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int do_above, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
@@ -33,7 +32,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
}
if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
+ const uint8x8_t L = vld1_u8(left); // left border
const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
const uint16x4_t p1 = vpadd_u16(p0, p0);
sum_left = vcombine_u16(p1, p1);
@@ -54,7 +53,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i;
for (i = 0; i < 4; ++i) {
- vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+ vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
}
}
}
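
dc_4x4 above reduces the 4 above and/or 4 left border pixels with pairwise-add intrinsics and fills the block with the rounded average. With both borders present, the scalar equivalent is (a sketch, not the library code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void dc_4x4_sketch(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left) {
  int i, sum = 0;
  uint8_t dc;
  for (i = 0; i < 4; ++i) sum += above[i] + left[i];
  dc = (uint8_t)((sum + 4) >> 3); /* rounded average of 8 border pixels */
  for (i = 0; i < 4; ++i) memset(dst + i * stride, dc, 4);
}
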
@@ -87,9 +86,8 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
// DC 8x8
// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int do_above, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
@@ -103,7 +101,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
}
if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
+ const uint8x8_t L = vld1_u8(left); // left border
const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x4_t p2 = vpadd_u16(p1, p1);
@@ -125,7 +123,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i;
for (i = 0; i < 8; ++i) {
- vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
+ vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
}
}
}
@@ -167,7 +165,7 @@ static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
if (do_above) {
const uint8x16_t A = vld1q_u8(above); // top row
- const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
const uint16x4_t p2 = vpadd_u16(p1, p1);
const uint16x4_t p3 = vpadd_u16(p2, p2);
@@ -425,8 +423,7 @@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
(void)left;
d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += stride)
- vst1_u8(dst, d0u8);
+ for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
}
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
@@ -436,8 +433,7 @@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
(void)left;
q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += stride)
- vst1q_u8(dst, q0u8);
+ for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
}
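
The vertical predictors above simply replicate the row of pixels immediately above the block into every output row, which is why clang-format can collapse each loop onto a single line. In scalar form for 8x8 (a sketch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void v_predictor_8x8_sketch(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above) {
  int i;
  for (i = 0; i < 8; ++i, dst += stride) memcpy(dst, above, 8); /* copy top row */
}
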
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
@@ -608,8 +604,8 @@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
for (i = 0; i < 4; i++, dst += stride) {
q1u16 = vdupq_n_u16((uint16_t)left[i]);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
- vreinterpretq_s16_u16(q3u16));
+ q1s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
d0u8 = vqmovun_s16(q1s16);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
@@ -631,26 +627,26 @@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 1);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 2);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 3);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
@@ -677,14 +673,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0);
q8u16 = vdupq_lane_u16(d20u16, 1);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
+ q1s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+ q11s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+ q8s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
@@ -698,14 +694,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
q0u16 = vdupq_lane_u16(d20u16, 2);
q8u16 = vdupq_lane_u16(d20u16, 3);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
+ q1s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+ q11s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+ q8s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
@@ -742,10 +738,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
d6u16 = vget_low_u16(q3u16);
for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
q0u16 = vdupq_lane_u16(d6u16, 0);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -761,10 +757,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 1);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -780,10 +776,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 2);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -799,10 +795,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 3);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
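
The vpx_tm_predictor_* hunks above implement TrueMotion prediction: each output pixel is left[r] + above[c] - top_left, clipped to 8 bits, so the NEON code pre-widens above[c] - top_left once and then adds a per-row left value before narrowing with vqmovun_s16. A scalar sketch of the 4x4 case, assuming clip_pixel() from vpx_dsp:

#include "vpx_dsp/vpx_dsp_common.h"

static void tm_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1]; /* pixel above-left of the block */
  int r, c;
  for (r = 0; r < 4; ++r, dst += stride)
    for (c = 0; c < 4; ++c) dst[c] = clip_pixel(left[r] + above[c] - top_left);
}
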
diff --git a/vpx_dsp/arm/loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c
index d24e6adc8..9607bb240 100644
--- a/vpx_dsp/arm/loopfilter_16_neon.c
+++ b/vpx_dsp/arm/loopfilter_16_neon.c
@@ -14,166 +14,160 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-static INLINE void loop_filter_neon_16(
- uint8x16_t qblimit, // blimit
- uint8x16_t qlimit, // limit
- uint8x16_t qthresh, // thresh
- uint8x16_t q3, // p3
- uint8x16_t q4, // p2
- uint8x16_t q5, // p1
- uint8x16_t q6, // p0
- uint8x16_t q7, // q0
- uint8x16_t q8, // q1
- uint8x16_t q9, // q2
- uint8x16_t q10, // q3
- uint8x16_t *q5r, // p1
- uint8x16_t *q6r, // p0
- uint8x16_t *q7r, // q0
- uint8x16_t *q8r) { // q1
- uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int16x8_t q2s16, q11s16;
- uint16x8_t q4u16;
- int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
- int8x8_t d2s8, d3s8;
-
- q11u8 = vabdq_u8(q3, q4);
- q12u8 = vabdq_u8(q4, q5);
- q13u8 = vabdq_u8(q5, q6);
- q14u8 = vabdq_u8(q8, q7);
- q3 = vabdq_u8(q9, q8);
- q4 = vabdq_u8(q10, q9);
-
- q11u8 = vmaxq_u8(q11u8, q12u8);
- q12u8 = vmaxq_u8(q13u8, q14u8);
- q3 = vmaxq_u8(q3, q4);
- q15u8 = vmaxq_u8(q11u8, q12u8);
-
- q9 = vabdq_u8(q6, q7);
-
- // vp8_hevmask
- q13u8 = vcgtq_u8(q13u8, qthresh);
- q14u8 = vcgtq_u8(q14u8, qthresh);
- q15u8 = vmaxq_u8(q15u8, q3);
-
- q2u8 = vabdq_u8(q5, q8);
- q9 = vqaddq_u8(q9, q9);
-
- q15u8 = vcgeq_u8(qlimit, q15u8);
-
- // vp8_filter() function
- // convert to signed
- q10 = vdupq_n_u8(0x80);
- q8 = veorq_u8(q8, q10);
- q7 = veorq_u8(q7, q10);
- q6 = veorq_u8(q6, q10);
- q5 = veorq_u8(q5, q10);
-
- q2u8 = vshrq_n_u8(q2u8, 1);
- q9 = vqaddq_u8(q9, q2u8);
-
- q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
- vget_low_s8(vreinterpretq_s8_u8(q6)));
- q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
- vget_high_s8(vreinterpretq_s8_u8(q6)));
-
- q9 = vcgeq_u8(qblimit, q9);
-
- q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
- vreinterpretq_s8_u8(q8));
-
- q14u8 = vorrq_u8(q13u8, q14u8);
-
- q4u16 = vdupq_n_u16(3);
- q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
- q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
-
- q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
- q15u8 = vandq_u8(q15u8, q9);
-
- q1s8 = vreinterpretq_s8_u8(q1u8);
- q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
- q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
-
- q4 = vdupq_n_u8(3);
- q9 = vdupq_n_u8(4);
- // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- d2s8 = vqmovn_s16(q2s16);
- d3s8 = vqmovn_s16(q11s16);
- q1s8 = vcombine_s8(d2s8, d3s8);
- q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
- q1s8 = vreinterpretq_s8_u8(q1u8);
-
- q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
- q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
- q2s8 = vshrq_n_s8(q2s8, 3);
- q1s8 = vshrq_n_s8(q1s8, 3);
-
- q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
- q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
-
- q1s8 = vrshrq_n_s8(q1s8, 1);
- q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
-
- q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
- q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
-
- *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
- *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
- *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
- *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
- return;
+static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vdupq_n_u16(3);
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q4 = vdupq_n_u8(3);
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+ return;
}
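
loop_filter_neon_16 above evaluates the 4-tap filter for sixteen pixel positions at once. Stripped of the vectorization, the per-position update it performs is roughly the following sketch, reconstructed from the intrinsics; mask and hev stand for the 0x00/0xff results of the limit and high-edge-variance tests, and pixels are biased by 0x80 into signed range:

#include <stdint.h>

static int8_t clamp8(int x) { /* saturate to int8, like the vqadd/vqsub ops */
  return (int8_t)(x < -128 ? -128 : (x > 127 ? 127 : x));
}

static void filter4_sketch(int8_t mask, int8_t hev, uint8_t *p1, uint8_t *p0,
                           uint8_t *q0, uint8_t *q1) {
  const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
  int8_t filter = (int8_t)(clamp8(ps1 - qs1) & hev);
  int8_t filter1, filter2;
  filter = (int8_t)(clamp8(filter + 3 * (qs0 - ps0)) & mask);
  filter1 = (int8_t)(clamp8(filter + 4) >> 3); /* applied to q0 */
  filter2 = (int8_t)(clamp8(filter + 3) >> 3); /* applied to p0 */
  *q0 = (uint8_t)(clamp8(qs0 - filter1) ^ 0x80);
  *p0 = (uint8_t)(clamp8(ps0 + filter2) ^ 0x80);
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev); /* outer taps unless hev */
  *q1 = (uint8_t)(clamp8(qs1 - filter) ^ 0x80);
  *p1 = (uint8_t)(clamp8(ps1 + filter) ^ 0x80);
}
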
-void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
- uint8x16_t qblimit, qlimit, qthresh;
- uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-
- dblimit0 = vld1_u8(blimit0);
- dlimit0 = vld1_u8(limit0);
- dthresh0 = vld1_u8(thresh0);
- dblimit1 = vld1_u8(blimit1);
- dlimit1 = vld1_u8(limit1);
- dthresh1 = vld1_u8(thresh1);
- qblimit = vcombine_u8(dblimit0, dblimit1);
- qlimit = vcombine_u8(dlimit0, dlimit1);
- qthresh = vcombine_u8(dthresh0, dthresh1);
-
- s -= (p << 2);
-
- q3u8 = vld1q_u8(s);
- s += p;
- q4u8 = vld1q_u8(s);
- s += p;
- q5u8 = vld1q_u8(s);
- s += p;
- q6u8 = vld1q_u8(s);
- s += p;
- q7u8 = vld1q_u8(s);
- s += p;
- q8u8 = vld1q_u8(s);
- s += p;
- q9u8 = vld1q_u8(s);
- s += p;
- q10u8 = vld1q_u8(s);
-
- loop_filter_neon_16(qblimit, qlimit, qthresh,
- q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
- &q5u8, &q6u8, &q7u8, &q8u8);
-
- s -= (p * 5);
- vst1q_u8(s, q5u8);
- s += p;
- vst1q_u8(s, q6u8);
- s += p;
- vst1q_u8(s, q7u8);
- s += p;
- vst1q_u8(s, q8u8);
- return;
+void vpx_lpf_horizontal_4_dual_neon(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+ uint8x16_t qblimit, qlimit, qthresh;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+ dblimit0 = vld1_u8(blimit0);
+ dlimit0 = vld1_u8(limit0);
+ dthresh0 = vld1_u8(thresh0);
+ dblimit1 = vld1_u8(blimit1);
+ dlimit1 = vld1_u8(limit1);
+ dthresh1 = vld1_u8(thresh1);
+ qblimit = vcombine_u8(dblimit0, dblimit1);
+ qlimit = vcombine_u8(dlimit0, dlimit1);
+ qthresh = vcombine_u8(dthresh0, dthresh1);
+
+ s -= (p << 2);
+
+ q3u8 = vld1q_u8(s);
+ s += p;
+ q4u8 = vld1q_u8(s);
+ s += p;
+ q5u8 = vld1q_u8(s);
+ s += p;
+ q6u8 = vld1q_u8(s);
+ s += p;
+ q7u8 = vld1q_u8(s);
+ s += p;
+ q8u8 = vld1q_u8(s);
+ s += p;
+ q9u8 = vld1q_u8(s);
+ s += p;
+ q10u8 = vld1q_u8(s);
+
+ loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
+ q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
+
+ s -= (p * 5);
+ vst1q_u8(s, q5u8);
+ s += p;
+ vst1q_u8(s, q6u8);
+ s += p;
+ vst1q_u8(s, q7u8);
+ s += p;
+ vst1q_u8(s, q8u8);
+ return;
}
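
vpx_lpf_horizontal_4_dual_neon packs two independent blimit/limit/thresh triples into the halves of each q register, so one pass filters two adjacent 8-pixel stretches of the same horizontal edge. In effect it computes the same result as two single-width calls, roughly as below; this is a sketch of the equivalence, not how the library dispatches it, and it assumes the RTCD header declares the NEON variants:

#include "./vpx_dsp_rtcd.h"

static void lpf_horizontal_4_dual_sketch(uint8_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
                                         const uint8_t *blimit1,
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0);     /* first 8 px */
  vpx_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1); /* next 8 px */
}
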
diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
index 7f3ee70b9..1c1e80e00 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c
@@ -12,255 +12,238 @@
#include "./vpx_dsp_rtcd.h"
-static INLINE void loop_filter_neon(
- uint8x8_t dblimit, // flimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p3
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d4ru8, // p1
- uint8x8_t *d5ru8, // p0
- uint8x8_t *d6ru8, // q0
- uint8x8_t *d7ru8) { // q1
- uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
- int16x8_t q12s16;
- int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d3u8 = vabd_u8(d17u8, d16u8);
- d4u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
- d3u8 = vmax_u8(d3u8, d4u8);
- d23u8 = vmax_u8(d19u8, d20u8);
-
- d17u8 = vabd_u8(d6u8, d7u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
- d22u8 = vcgt_u8(d22u8, dthresh);
- d23u8 = vmax_u8(d23u8, d3u8);
-
- d28u8 = vabd_u8(d5u8, d16u8);
- d17u8 = vqadd_u8(d17u8, d17u8);
-
- d23u8 = vcge_u8(dlimit, d23u8);
-
- d18u8 = vdup_n_u8(0x80);
- d5u8 = veor_u8(d5u8, d18u8);
- d6u8 = veor_u8(d6u8, d18u8);
- d7u8 = veor_u8(d7u8, d18u8);
- d16u8 = veor_u8(d16u8, d18u8);
-
- d28u8 = vshr_n_u8(d28u8, 1);
- d17u8 = vqadd_u8(d17u8, d28u8);
-
- d19u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
- vreinterpret_s8_u8(d6u8));
-
- d17u8 = vcge_u8(dblimit, d17u8);
-
- d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
- vreinterpret_s8_u8(d16u8));
-
- d22u8 = vorr_u8(d21u8, d22u8);
-
- q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
- d23u8 = vand_u8(d23u8, d17u8);
-
- q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
- d17u8 = vdup_n_u8(4);
-
- d27s8 = vqmovn_s16(q12s16);
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
- d27s8 = vreinterpret_s8_u8(d27u8);
-
- d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
- d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
- d28s8 = vshr_n_s8(d28s8, 3);
- d27s8 = vshr_n_s8(d27s8, 3);
-
- d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
- d27s8 = vrshr_n_s8(d27s8, 1);
- d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
- d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
- d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
- *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
- *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
- *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
- return;
+static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
}
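
The comparisons at the top of loop_filter_neon above (and of loop_filter_neon_16 earlier) build the filter mask and the high-edge-variance flag from absolute pixel differences. In scalar form, per pixel position (a sketch reconstructed from the vabd/vmax/vcge/vcgt sequence):

#include <stdint.h>
#include <stdlib.h>

static int filter_mask_sketch(uint8_t limit, uint8_t blimit, uint8_t p3,
                              uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                              uint8_t q1, uint8_t q2, uint8_t q3) {
  int m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(p1 - p0) > m) m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  /* allow filtering only where the step does not look like a real edge */
  return m <= limit && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

static int hev_mask_sketch(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0,
                           uint8_t q1) {
  /* high edge variance: only the inner taps are adjusted when this is set */
  return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}
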
-void vpx_lpf_horizontal_4_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < 1; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- s -= (pitch * 5);
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- s += pitch;
- vst1_u8(s, d6u8);
- s += pitch;
- vst1_u8(s, d7u8);
- }
- return;
+void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < 1; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
}
-void vpx_lpf_vertical_4_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i, pitch8;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- pitch8 = pitch * 8;
- for (i = 0; i < 1; i++, src += pitch8) {
- s = src - (i + 1) * 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- d4Result.val[0] = d4u8;
- d4Result.val[1] = d5u8;
- d4Result.val[2] = d6u8;
- d4Result.val[3] = d7u8;
-
- src -= 2;
- vst4_lane_u8(src, d4Result, 0);
- src += pitch;
- vst4_lane_u8(src, d4Result, 1);
- src += pitch;
- vst4_lane_u8(src, d4Result, 2);
- src += pitch;
- vst4_lane_u8(src, d4Result, 3);
- src += pitch;
- vst4_lane_u8(src, d4Result, 4);
- src += pitch;
- vst4_lane_u8(src, d4Result, 5);
- src += pitch;
- vst4_lane_u8(src, d4Result, 6);
- src += pitch;
- vst4_lane_u8(src, d4Result, 7);
- }
- return;
+void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < 1; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
}
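For reference, the arithmetic that loop_filter_neon vectorizes above is the usual 4-tap edge filter applied in the signed (xor 0x80) domain. The scalar sketch below models one pixel column; it is illustrative only and not part of this change. The helper names and the convention that mask/hev are 0 or -1 per pixel (mirroring the all-zeros/all-ones vector masks produced by vcge/vcgt) are assumptions.

#include <stdint.h>

/* Saturate to int8, mirroring the saturating NEON ops (vqadd_s8/vqsub_s8). */
static int8_t clamp8(int v) { return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v); }

/* One column of the 4-tap filter; mask/hev are 0 or -1 for this pixel. */
static void filter4_scalar(int8_t mask, int8_t hev, uint8_t *p1, uint8_t *p0,
                           uint8_t *q0, uint8_t *q1) {
  const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
  int8_t f = (int8_t)(clamp8(ps1 - qs1) & hev);      /* inner taps only where hev */
  int8_t f1, f2;

  f = (int8_t)(clamp8(f + 3 * (qs0 - ps0)) & mask);  /* gate by the edge mask */
  f1 = (int8_t)(clamp8(f + 4) >> 3);                 /* vqadd + vshr_n_s8(, 3) */
  f2 = (int8_t)(clamp8(f + 3) >> 3);

  *q0 = (uint8_t)(clamp8(qs0 - f1) ^ 0x80);          /* back out of the signed domain */
  *p0 = (uint8_t)(clamp8(ps0 + f2) ^ 0x80);

  f1 = (int8_t)(((f1 + 1) >> 1) & ~hev);             /* vrshr_n_s8 + vbic: skip hev pixels */
  *q1 = (uint8_t)(clamp8(qs1 - f1) ^ 0x80);
  *p1 = (uint8_t)(clamp8(ps1 + f1) ^ 0x80);
}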
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
index ec3757380..854196f42 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -12,434 +12,418 @@
#include "./vpx_dsp_rtcd.h"
-static INLINE void mbloop_filter_neon(
- uint8x8_t dblimit, // mblimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p2
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d0ru8, // p1
- uint8x8_t *d1ru8, // p1
- uint8x8_t *d2ru8, // p0
- uint8x8_t *d3ru8, // q0
- uint8x8_t *d4ru8, // q1
- uint8x8_t *d5ru8) { // q1
- uint32_t flat;
- uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
- uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
- int16x8_t q15s16;
- uint16x8_t q10u16, q14u16;
- int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p2
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q2
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d23u8 = vabd_u8(d17u8, d16u8);
- d24u8 = vabd_u8(d18u8, d17u8);
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
- d25u8 = vabd_u8(d6u8, d4u8);
+ d25u8 = vabd_u8(d6u8, d4u8);
- d23u8 = vmax_u8(d23u8, d24u8);
+ d23u8 = vmax_u8(d23u8, d24u8);
- d26u8 = vabd_u8(d7u8, d17u8);
+ d26u8 = vabd_u8(d7u8, d17u8);
- d19u8 = vmax_u8(d19u8, d20u8);
+ d19u8 = vmax_u8(d19u8, d20u8);
- d24u8 = vabd_u8(d6u8, d7u8);
- d27u8 = vabd_u8(d3u8, d6u8);
- d28u8 = vabd_u8(d18u8, d7u8);
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
- d19u8 = vmax_u8(d19u8, d23u8);
+ d19u8 = vmax_u8(d19u8, d23u8);
- d23u8 = vabd_u8(d5u8, d16u8);
- d24u8 = vqadd_u8(d24u8, d24u8);
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+ d19u8 = vcge_u8(dlimit, d19u8);
- d19u8 = vcge_u8(dlimit, d19u8);
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+ d23u8 = vshr_n_u8(d23u8, 1);
- d25u8 = vmax_u8(d25u8, d26u8);
- d26u8 = vmax_u8(d27u8, d28u8);
+ d25u8 = vmax_u8(d25u8, d26u8);
- d23u8 = vshr_n_u8(d23u8, 1);
+ d24u8 = vqadd_u8(d24u8, d23u8);
- d25u8 = vmax_u8(d25u8, d26u8);
+ d20u8 = vmax_u8(d20u8, d25u8);
- d24u8 = vqadd_u8(d24u8, d23u8);
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
- d20u8 = vmax_u8(d20u8, d25u8);
+ d21u8 = vcgt_u8(d21u8, dthresh);
- d23u8 = vdup_n_u8(1);
- d24u8 = vcge_u8(dblimit, d24u8);
+ d20u8 = vcge_u8(d23u8, d20u8);
- d21u8 = vcgt_u8(d21u8, dthresh);
+ d19u8 = vand_u8(d19u8, d24u8);
- d20u8 = vcge_u8(d23u8, d20u8);
+ d23u8 = vcgt_u8(d22u8, dthresh);
- d19u8 = vand_u8(d19u8, d24u8);
+ d20u8 = vand_u8(d20u8, d19u8);
- d23u8 = vcgt_u8(d22u8, dthresh);
+ d22u8 = vdup_n_u8(0x80);
- d20u8 = vand_u8(d20u8, d19u8);
+ d23u8 = vorr_u8(d21u8, d23u8);
- d22u8 = vdup_n_u8(0x80);
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
- d23u8 = vorr_u8(d21u8, d23u8);
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
- q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
- vreinterpret_u16_u8(d21u8));
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
- d30u8 = vshrn_n_u16(q10u16, 4);
- flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
- if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
- d27u8 = vdup_n_u8(3);
- d21u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+ d27u8 = vdup_n_u8(3);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d5ru8 = vqrshrn_n_u16(q14u16, 3);
- } else {
- d21u8 = veor_u8(d7u8, d22u8);
- d24u8 = veor_u8(d6u8, d22u8);
- d25u8 = veor_u8(d5u8, d22u8);
- d26u8 = veor_u8(d16u8, d22u8);
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
- d27u8 = vdup_n_u8(3);
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
- d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
- d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
- q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ q15s16 = vaddw_s8(q15s16, d29s8);
- d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+ d29u8 = vdup_n_u8(4);
- q15s16 = vaddw_s8(q15s16, d29s8);
+ d28s8 = vqmovn_s16(q15s16);
- d29u8 = vdup_n_u8(4);
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
- d28s8 = vqmovn_s16(q15s16);
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
- d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
- d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
- d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
- d30s8 = vshr_n_s8(d30s8, 3);
- d29s8 = vshr_n_s8(d29s8, 3);
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
- d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
- d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
- d29s8 = vrshr_n_s8(d29s8, 1);
- d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
- if (flat == 0) { // filter_branch_only
- *d0ru8 = d4u8;
- *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
- *d5ru8 = d17u8;
- return;
- }
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
- d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
- d23u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
- d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
- d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
- d30u8 = vqrshrn_n_u16(q14u16, 3);
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
- d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
- d31u8 = vqrshrn_n_u16(q14u16, 3);
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
- *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
- d23u8 = vqrshrn_n_u16(q14u16, 3);
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
- *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
- *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
- d22u8 = vqrshrn_n_u16(q14u16, 3);
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
- d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
- d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
- d6u8 = vqrshrn_n_u16(q14u16, 3);
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
- d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
- d7u8 = vqrshrn_n_u16(q14u16, 3);
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
- *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
- *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
- *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
- }
- return;
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
}
-void vpx_lpf_horizontal_8_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < 1; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- s -= (pitch * 6);
- vst1_u8(s, d0u8);
- s += pitch;
- vst1_u8(s, d1u8);
- s += pitch;
- vst1_u8(s, d2u8);
- s += pitch;
- vst1_u8(s, d3u8);
- s += pitch;
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- }
- return;
+void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < 1; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+ &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
}
-void vpx_lpf_vertical_8_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
- uint8x8x2_t d2Result;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- for (i = 0; i < 1; i++) {
- s = src + (i * (pitch << 3)) - 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- d4Result.val[0] = d0u8;
- d4Result.val[1] = d1u8;
- d4Result.val[2] = d2u8;
- d4Result.val[3] = d3u8;
-
- d2Result.val[0] = d4u8;
- d2Result.val[1] = d5u8;
-
- s = src - 3;
- vst4_lane_u8(s, d4Result, 0);
- s += pitch;
- vst4_lane_u8(s, d4Result, 1);
- s += pitch;
- vst4_lane_u8(s, d4Result, 2);
- s += pitch;
- vst4_lane_u8(s, d4Result, 3);
- s += pitch;
- vst4_lane_u8(s, d4Result, 4);
- s += pitch;
- vst4_lane_u8(s, d4Result, 5);
- s += pitch;
- vst4_lane_u8(s, d4Result, 6);
- s += pitch;
- vst4_lane_u8(s, d4Result, 7);
-
- s = src + 1;
- vst2_lane_u8(s, d2Result, 0);
- s += pitch;
- vst2_lane_u8(s, d2Result, 1);
- s += pitch;
- vst2_lane_u8(s, d2Result, 2);
- s += pitch;
- vst2_lane_u8(s, d2Result, 3);
- s += pitch;
- vst2_lane_u8(s, d2Result, 4);
- s += pitch;
- vst2_lane_u8(s, d2Result, 5);
- s += pitch;
- vst2_lane_u8(s, d2Result, 6);
- s += pitch;
- vst2_lane_u8(s, d2Result, 7);
- }
- return;
+void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < 1; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+ &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
}
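The flat branch of mbloop_filter_neon above computes each output as a rounded wide average and carries a running sum in q14u16, subtracting the taps that leave the window (vsubw_u8) and adding the ones that enter it (vaddw_u8), then narrowing with vqrshrn_n_u16(, 3). The scalar sketch below models that update pattern; the names and the locally defined rounding macro are illustrative, not code touched by this change.

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Scalar model of the flat (wide-filter) branch for one pixel column. */
static void filter8_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                           uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                           uint8_t *op2, uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
  int sum = 3 * p3 + 2 * p2 + p1 + p0 + q0;            /* vaddl/vmlal/vaddw */
  *op2 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);           /* vqrshrn_n_u16(, 3) */
  sum += -p3 - p2 + p1 + q1;  *op1 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += -p3 - p1 + p0 + q2;  *op0 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += -p3 - p0 + q0 + q3;  *oq0 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += -p2 - q0 + q1 + q3;  *oq1 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += -p1 - q1 + q2 + q3;  *oq2 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
}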
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index aa31f2935..9129b5d2d 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -14,42 +14,32 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
+void vpx_lpf_horizontal_8_dual_neon(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
}
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
-void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index c7704dc1b..dc2039800 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -16,10 +16,10 @@
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
- vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
- vget_high_u16(vec_hi));
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@@ -33,8 +33,7 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
static void sad_neon_64(const uint8x16_t vec_src_00,
const uint8x16_t vec_src_16,
const uint8x16_t vec_src_32,
- const uint8x16_t vec_src_48,
- const uint8_t *ref,
+ const uint8x16_t vec_src_48, const uint8_t *ref,
uint16x8_t *vec_sum_ref_lo,
uint16x8_t *vec_sum_ref_hi) {
const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@@ -63,8 +62,7 @@ static void sad_neon_64(const uint8x16_t vec_src_00,
// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16,
- const uint8_t *ref,
+ const uint8x16_t vec_src_16, const uint8_t *ref,
uint16x8_t *vec_sum_ref_lo,
uint16x8_t *vec_sum_ref_hi) {
const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@@ -81,7 +79,7 @@ static void sad_neon_32(const uint8x16_t vec_src_00,
}
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t* const ref[4], int ref_stride,
+ const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@@ -127,7 +125,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
}
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t* const ref[4], int ref_stride,
+ const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@@ -148,14 +146,14 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
const uint8x16_t vec_src_00 = vld1q_u8(src);
const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- sad_neon_32(vec_src_00, vec_src_16, ref0,
- &vec_sum_ref0_lo, &vec_sum_ref0_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref1,
- &vec_sum_ref1_lo, &vec_sum_ref1_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref2,
- &vec_sum_ref2_lo, &vec_sum_ref2_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref3,
- &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
+ &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
+ &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
+ &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
+ &vec_sum_ref3_hi);
src += src_stride;
ref0 += ref_stride;
@@ -171,7 +169,7 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
}
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t* const ref[4], int ref_stride,
+ const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@@ -195,20 +193,20 @@ void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
const uint8x16_t vec_ref2 = vld1q_u8(ref2);
const uint8x16_t vec_ref3 = vld1q_u8(ref3);
- vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
- vget_low_u8(vec_ref0));
+ vec_sum_ref0_lo =
+ vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref0));
- vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
- vget_low_u8(vec_ref1));
+ vec_sum_ref1_lo =
+ vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref1));
- vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
- vget_low_u8(vec_ref2));
+ vec_sum_ref2_lo =
+ vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref2));
- vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
- vget_low_u8(vec_ref3));
+ vec_sum_ref3_lo =
+ vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref3));
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index 173f08ac3..ff3228768 100644
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -14,114 +14,105 @@
#include "vpx/vpx_integer.h"
-unsigned int vpx_sad8x16_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
+unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+ for (i = 0; i < 15; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 15; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
}
-unsigned int vpx_sad4x4_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x2_t d1;
- uint64x1_t d3;
- int i;
+unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+ for (i = 0; i < 3; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 3; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
+ q12 = vabal_u8(q12, d0, d8);
+ }
- d1 = vpaddl_u16(vget_low_u16(q12));
- d3 = vpaddl_u32(d1);
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
- return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
-unsigned int vpx_sad16x8_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
+unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+ for (i = 0; i < 7; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 7; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
}
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
- vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
- vget_high_u16(vec_hi));
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@@ -208,10 +199,10 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
const uint8x16_t vec_ref = vld1q_u8(ref);
src += src_stride;
ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
- vget_low_u8(vec_ref));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref));
+ vec_accum_lo =
+ vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
+ vec_accum_hi =
+ vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
}
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
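The SAD kernels above all compute the same quantity, a sum of absolute differences over a block, accumulating per lane with vabdl/vabal and reducing with pairwise adds (vpaddl/vadd). A scalar reference sketch follows; the function name is illustrative and it is not part of this change.

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a width x height block. */
static unsigned int sad_scalar(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) sad += (unsigned int)abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}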
diff --git a/vpx_dsp/arm/subpel_variance_media.c b/vpx_dsp/arm/subpel_variance_media.c
index e7d8c85fb..ab5336157 100644
--- a/vpx_dsp/arm/subpel_variance_media.c
+++ b/vpx_dsp/arm/subpel_variance_media.c
@@ -14,91 +14,66 @@
#include "vpx_ports/mem.h"
#if HAVE_MEDIA
-static const int16_t bilinear_filters_media[8][2] = {
- { 128, 0 },
- { 112, 16 },
- { 96, 32 },
- { 80, 48 },
- { 64, 64 },
- { 48, 80 },
- { 32, 96 },
- { 16, 112 }
-};
+static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
+ { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 },
+ { 32, 96 }, { 16, 112 } };
-extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
- uint16_t *dst_ptr,
- uint32_t src_pitch,
- uint32_t height,
- uint32_t width,
- const int16_t *filter);
+extern void vpx_filter_block2d_bil_first_pass_media(
+ const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
+ uint32_t height, uint32_t width, const int16_t *filter);
-extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
- uint8_t *dst_ptr,
- int32_t src_pitch,
- uint32_t height,
- uint32_t width,
- const int16_t *filter);
+extern void vpx_filter_block2d_bil_second_pass_media(
+ const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
+ uint32_t height, uint32_t width, const int16_t *filter);
-
-unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset, int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- uint16_t first_pass[10*8];
- uint8_t second_pass[8*8];
+unsigned int vpx_sub_pixel_variance8x8_media(
+ const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+ const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+ uint16_t first_pass[10 * 8];
+ uint8_t second_pass[8 * 8];
const int16_t *HFilter, *VFilter;
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
- src_pixels_per_line,
- 9, 8, HFilter);
- vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
- 8, 8, 8, VFilter);
+ src_pixels_per_line, 9, 8, HFilter);
+ vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
+ VFilter);
- return vpx_variance8x8_media(second_pass, 8, dst_ptr,
- dst_pixels_per_line, sse);
+ return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
+ sse);
}
-unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- uint16_t first_pass[36*16];
- uint8_t second_pass[20*16];
+unsigned int vpx_sub_pixel_variance16x16_media(
+ const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+ const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+ uint16_t first_pass[36 * 16];
+ uint8_t second_pass[20 * 16];
const int16_t *HFilter, *VFilter;
unsigned int var;
if (xoffset == 4 && yoffset == 0) {
- var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- sse);
+ var = vpx_variance_halfpixvar16x16_h_media(
+ src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
} else if (xoffset == 0 && yoffset == 4) {
- var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- sse);
+ var = vpx_variance_halfpixvar16x16_v_media(
+ src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
} else if (xoffset == 4 && yoffset == 4) {
- var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- sse);
+ var = vpx_variance_halfpixvar16x16_hv_media(
+ src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
} else {
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
- vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
- src_pixels_per_line,
- 17, 16, HFilter);
- vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
- 16, 16, 16, VFilter);
+ vpx_filter_block2d_bil_first_pass_media(
+ src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
+ vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
+ 16, VFilter);
- var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
- dst_pixels_per_line, sse);
+ var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
+ sse);
}
return var;
}
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
index 40e2cc89b..f044e11a1 100644
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -18,14 +18,8 @@
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -79,74 +73,61 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
}
}
-unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
- 9, 8,
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
bilinear_filters[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
- 8, bilinear_filters[yoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
+ bilinear_filters[yoffset]);
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 17, 16,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
- 16, bilinear_filters[yoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
+ bilinear_filters[yoffset]);
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 33, 32,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
- 32, bilinear_filters[yoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
+ bilinear_filters[yoffset]);
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 65, 64,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
- 64, bilinear_filters[yoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
+ bilinear_filters[yoffset]);
return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
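The sub-pixel variance functions above run two bilinear passes (horizontal with pixel_step 1, then vertical with pixel_step equal to the block width) into a temporary block before calling the plain variance kernel. The sketch below models one such pass: the two taps come from the 8-entry bilinear table and sum to 128, hence the rounded shift by 7. The helper name and the assumption that the destination is written densely with a row stride equal to the block width are illustrative, not part of this change.

#include <stdint.h>

#define BIL_FILTER_BITS 7  /* the bilinear taps above sum to 128 = 1 << 7 */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* One bilinear pass: blend each pixel with its neighbour pixel_step away. */
static void bilinear_pass_scalar(const uint8_t *src, uint8_t *dst,
                                 int src_stride, int pixel_step, int height,
                                 int width, const uint8_t *filter) {
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      dst[c] = (uint8_t)ROUND_POWER_OF_TWO(
          src[c] * filter[0] + src[c + pixel_step] * filter[1],
          BIL_FILTER_BITS);
    }
    src += src_stride;
    dst += width;  /* temporary block is written densely */
  }
}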
diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c
index 7b146095e..ce81fb630 100644
--- a/vpx_dsp/arm/subtract_neon.c
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -13,10 +13,10 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-void vpx_subtract_block_neon(int rows, int cols,
- int16_t *diff, ptrdiff_t diff_stride,
- const uint8_t *src, ptrdiff_t src_stride,
- const uint8_t *pred, ptrdiff_t pred_stride) {
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
int r, c;
if (cols > 16) {
@@ -24,38 +24,38 @@ void vpx_subtract_block_neon(int rows, int cols,
for (c = 0; c < cols; c += 32) {
const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
- const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
- const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
- vget_low_u8(v_pred_00));
- const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
- vget_high_u8(v_pred_00));
- const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
- vget_low_u8(v_pred_16));
- const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
- vget_high_u8(v_pred_16));
- vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
- vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ const uint16x8_t v_diff_lo_00 =
+ vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 =
+ vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 =
+ vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 =
+ vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
}
diff += diff_stride;
pred += pred_stride;
- src += src_stride;
+ src += src_stride;
}
} else if (cols > 8) {
for (r = 0; r < rows; ++r) {
const uint8x16_t v_src = vld1q_u8(&src[0]);
const uint8x16_t v_pred = vld1q_u8(&pred[0]);
- const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
- vget_low_u8(v_pred));
- const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
- vget_high_u8(v_pred));
+ const uint16x8_t v_diff_lo =
+ vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi =
+ vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
diff += diff_stride;
pred += pred_stride;
- src += src_stride;
+ src += src_stride;
}
} else if (cols > 4) {
for (r = 0; r < rows; ++r) {
@@ -65,16 +65,15 @@ void vpx_subtract_block_neon(int rows, int cols,
vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
diff += diff_stride;
pred += pred_stride;
- src += src_stride;
+ src += src_stride;
}
} else {
for (r = 0; r < rows; ++r) {
- for (c = 0; c < cols; ++c)
- diff[c] = src[c] - pred[c];
+ for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
diff += diff_stride;
pred += pred_stride;
- src += src_stride;
+ src += src_stride;
}
}
}
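The variance kernels in the next file accumulate the sum and the sum of squares of the pixel differences and then compute variance = sse - sum^2 / (w * h); for the 2048- and 4096-pixel blocks the division becomes the >> 11 and >> 12 shifts seen in the comments, and the large blocks are processed in 16- or 32-row calls because of the w * h < 2048 restriction noted above variance_neon_w8. The scalar sketch below is illustrative only; the name is an assumption and it is not code touched by this change.

#include <stdint.h>

/* Scalar model: returns the variance and writes the sum of squared
 * differences to *sse, matching the NEON kernels' outputs. */
static uint32_t variance_scalar(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint32_t sq = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sum += d;
      sq += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = sq;
  return sq - (uint32_t)(((int64_t)sum * sum) / (w * h));
}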
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index ede6e7bbb..f469afc4e 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -32,9 +32,9 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
}
// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, uint32_t *sse, int *sum) {
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse,
+ int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -47,12 +47,10 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo = vmlal_s16(v_sse_lo,
- vget_low_s16(sv_diff),
- vget_low_s16(sv_diff));
- v_sse_hi = vmlal_s16(v_sse_hi,
- vget_high_s16(sv_diff),
- vget_high_s16(sv_diff));
+ v_sse_lo =
+ vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+ v_sse_hi =
+ vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
}
a += a_stride;
b += b_stride;
@@ -62,15 +60,13 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
-void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse, int *sum) {
+void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}
-void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse, int *sum) {
+void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}
@@ -104,9 +100,8 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
- variance_neon_w8(a + (32 * a_stride), a_stride,
- b + (32 * b_stride), b_stride, 32, 32,
- &sse2, &sum2);
+ variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
+ 32, 32, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
@@ -118,9 +113,8 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride,
- b + (16 * b_stride), b_stride, 64, 16,
- &sse2, &sum2);
+ variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
@@ -133,286 +127,273 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride,
- b + (16 * b_stride), b_stride, 64, 16,
- &sse2, &sum2);
+ variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
- variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
- b + (16 * 2 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
+ variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
- variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
- b + (16 * 3 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
+ variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
}
-unsigned int vpx_variance16x8_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) {
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
+unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride, unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) {
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
- return vget_lane_u32(d0u32, 0);
-}
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-unsigned int vpx_variance8x16_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d2u8, d4u8, d6u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint16x8_t q11u16, q12u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) {
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d2u8, d6u8);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
- }
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
+ return vget_lane_u32(d0u32, 0);
+}
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride, unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d2u8, d4u8, d6u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint16x8_t q11u16, q12u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) {
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d2u8, d6u8);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ }
- return vget_lane_u32(d0u32, 0);
-}
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
-unsigned int vpx_mse16x16_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- int64x1_t d0s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- q7s32 = vdupq_n_s32(0);
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) { // mse16x16_neon_loop
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
- q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
- q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q10s32 = vaddq_s32(q7s32, q9s32);
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
- q1s64 = vpaddlq_s32(q10s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+ return vget_lane_u32(d0u32, 0);
}
-unsigned int vpx_get4x4sse_cs_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride) {
- int16x4_t d22s16, d24s16, d26s16, d28s16;
- int64x1_t d0s64;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- d0u8 = vld1_u8(src_ptr);
+unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int64x1_t d0s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ q7s32 = vdupq_n_s32(0);
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) { // mse16x16_neon_loop
+ q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d1u8 = vld1_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
- d5u8 = vld1_u8(ref_ptr);
+ q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d3u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d7u8 = vld1_u8(ref_ptr);
+ q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d1u8, d5u8);
- q13u16 = vsubl_u8(d2u8, d6u8);
- q14u16 = vsubl_u8(d3u8, d7u8);
-
- d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
- d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
- d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
- d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+ q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+ q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
- q7s32 = vmull_s16(d22s16, d22s16);
- q8s32 = vmull_s16(d24s16, d24s16);
- q9s32 = vmull_s16(d26s16, d26s16);
- q10s32 = vmull_s16(d28s16, d28s16);
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q10s32 = vaddq_s32(q7s32, q9s32);
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q9s32 = vaddq_s32(q7s32, q9s32);
+ q1s64 = vpaddlq_s32(q10s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
- q1s64 = vpaddlq_s32(q9s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride) {
+ int16x4_t d22s16, d24s16, d26s16, d28s16;
+ int64x1_t d0s64;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d1u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d5u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d3u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d7u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d1u8, d5u8);
+ q13u16 = vsubl_u8(d2u8, d6u8);
+ q14u16 = vsubl_u8(d3u8, d7u8);
+
+ d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+ d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+ d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+ d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+ q7s32 = vmull_s16(d22s16, d22s16);
+ q8s32 = vmull_s16(d24s16, d24s16);
+ q9s32 = vmull_s16(d26s16, d26s16);
+ q10s32 = vmull_s16(d28s16, d28s16);
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q9s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q9s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
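
For reference, every kernel in this file reduces to the same scalar definition: accumulate the difference sum and the sum of squared differences over the block, then return sse minus sum * sum divided by the pixel count (the ">> 11" and ">> 12" comments above are that division for 32x64/64x32 and 64x64 blocks). A minimal scalar sketch of that formula, not the library's implementation; the function name is invented for illustration:

#include <stdint.h>

/* Scalar model of the block variance computed by the NEON kernels above:
 * sse is the sum of squared differences, sum is the plain difference sum,
 * and the result is sse - sum * sum / (w * h). */
uint32_t block_variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride, int w, int h, uint32_t *sse) {
  int i, j;
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;
  /* for a 64x64 block this division is the ">> 12" in the code above */
  return (uint32_t)(*sse - (uint64_t)((sum * sum) / (w * h)));
}
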
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/vpx_dsp/arm/vpx_convolve8_avg_neon.c
index 863225013..69cb28400 100644
--- a/vpx_dsp/arm/vpx_convolve8_avg_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -16,16 +16,11 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+ int16x4_t dsrc2, int16x4_t dsrc3,
+ int16x4_t dsrc4, int16x4_t dsrc5,
+ int16x4_t dsrc6, int16x4_t dsrc7,
+ int16x8_t q0s16) {
int32x4_t qdst;
int16x4_t d0s16, d1s16;
@@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
return qdst;
}
-void vpx_convolve8_avg_horiz_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w, int h) {
int width;
const uint8_t *s;
uint8_t *d;
@@ -76,7 +66,7 @@ void vpx_convolve8_avg_horiz_neon(
q0s16 = vld1q_s16(filter_x);
- src -= 3; // adjust for taps
+ src -= 3; // adjust for taps
for (; h > 0; h -= 4) { // loop_horiz_v
s = src;
d24u8 = vld1_u8(s);
@@ -90,8 +80,8 @@ void vpx_convolve8_avg_horiz_neon(
q12u8 = vcombine_u8(d24u8, d25u8);
q13u8 = vcombine_u8(d26u8, d27u8);
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
+ q0x2u16 =
+ vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@@ -116,10 +106,8 @@ void vpx_convolve8_avg_horiz_neon(
q9u16 = vcombine_u16(d17u16, d19u16);
d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w;
- width > 0;
- width -= 4, src += 4, dst += 4) { // loop_horiz
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz
s = src;
d28u32 = vld1_dup_u32((const uint32_t *)s);
s += src_stride;
@@ -131,10 +119,10 @@ void vpx_convolve8_avg_horiz_neon(
__builtin_prefetch(src + 64);
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
+ d0x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+ d1x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
vreinterpret_u8_u16(d1x2u16.val[0])); // d29
d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
@@ -144,8 +132,8 @@ void vpx_convolve8_avg_horiz_neon(
q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
+ q0x2u32 =
+ vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@@ -173,14 +161,14 @@ void vpx_convolve8_avg_horiz_neon(
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+ d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
__builtin_prefetch(src + 64 + src_stride * 3);
@@ -195,8 +183,7 @@ void vpx_convolve8_avg_horiz_neon(
d2u8 = vqmovn_u16(q1u16);
d3u8 = vqmovn_u16(q2u16);
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
vreinterpret_u32_u16(d0x2u16.val[1]));
d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@@ -231,17 +218,12 @@ void vpx_convolve8_avg_horiz_neon(
return;
}
-void vpx_convolve8_avg_vert_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int height;
const uint8_t *s;
uint8_t *d;
@@ -277,8 +259,8 @@ void vpx_convolve8_avg_vert_neon(
d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
s += src_stride;
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
@@ -319,20 +301,20 @@ void vpx_convolve8_avg_vert_neon(
__builtin_prefetch(s);
__builtin_prefetch(s + src_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, q0s16);
__builtin_prefetch(s + src_stride * 2);
__builtin_prefetch(s + src_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, q0s16);
__builtin_prefetch(d);
__builtin_prefetch(d + dst_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+ d26s16, d27s16, q0s16);
__builtin_prefetch(d + dst_stride * 2);
__builtin_prefetch(d + dst_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
d2u16 = vqrshrun_n_s32(q1s32, 7);
d3u16 = vqrshrun_n_s32(q2s32, 7);
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index 9bd715e2c..514525696 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -16,16 +16,11 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+ int16x4_t dsrc2, int16x4_t dsrc3,
+ int16x4_t dsrc4, int16x4_t dsrc5,
+ int16x4_t dsrc6, int16x4_t dsrc7,
+ int16x8_t q0s16) {
int32x4_t qdst;
int16x4_t d0s16, d1s16;
@@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
return qdst;
}
-void vpx_convolve8_horiz_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w, int h) {
int width;
const uint8_t *s, *psrc;
uint8_t *d, *pdst;
@@ -77,9 +67,8 @@ void vpx_convolve8_horiz_neon(
q0s16 = vld1q_s16(filter_x);
src -= 3; // adjust for taps
- for (; h > 0; h -= 4,
- src += src_stride * 4,
- dst += dst_stride * 4) { // loop_horiz_v
+ for (; h > 0; h -= 4, src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
s = src;
d24u8 = vld1_u8(s);
s += src_stride;
@@ -92,8 +81,8 @@ void vpx_convolve8_horiz_neon(
q12u8 = vcombine_u8(d24u8, d25u8);
q13u8 = vcombine_u8(d26u8, d27u8);
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
+ q0x2u16 =
+ vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@@ -105,8 +94,8 @@ void vpx_convolve8_horiz_neon(
__builtin_prefetch(src + src_stride * 5);
__builtin_prefetch(src + src_stride * 6);
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
q10u16 = vmovl_u8(d1x2u8.val[0]);
q11u16 = vmovl_u8(d1x2u8.val[1]);
@@ -119,8 +108,7 @@ void vpx_convolve8_horiz_neon(
d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w, psrc = src + 7, pdst = dst;
- width > 0;
+ for (width = w, psrc = src + 7, pdst = dst; width > 0;
width -= 4, psrc += 4, pdst += 4) { // loop_horiz
s = psrc;
d28u32 = vld1_dup_u32((const uint32_t *)s);
@@ -133,10 +121,10 @@ void vpx_convolve8_horiz_neon(
__builtin_prefetch(psrc + 64);
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
+ d0x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+ d1x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
vreinterpret_u8_u16(d1x2u16.val[0])); // d29
d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
@@ -146,8 +134,8 @@ void vpx_convolve8_horiz_neon(
q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
+ q0x2u32 =
+ vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@@ -166,14 +154,14 @@ void vpx_convolve8_horiz_neon(
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+ d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
__builtin_prefetch(psrc + 60 + src_stride * 3);
@@ -188,8 +176,7 @@ void vpx_convolve8_horiz_neon(
d2u8 = vqmovn_u16(q1u16);
d3u8 = vqmovn_u16(q2u16);
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
vreinterpret_u32_u16(d0x2u16.val[1]));
d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@@ -217,17 +204,12 @@ void vpx_convolve8_horiz_neon(
return;
}
-void vpx_convolve8_vert_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int height;
const uint8_t *s;
uint8_t *d;
@@ -261,8 +243,8 @@ void vpx_convolve8_vert_neon(
d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
s += src_stride;
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
@@ -294,20 +276,20 @@ void vpx_convolve8_vert_neon(
__builtin_prefetch(d);
__builtin_prefetch(d + dst_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, q0s16);
__builtin_prefetch(d + dst_stride * 2);
__builtin_prefetch(d + dst_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, q0s16);
__builtin_prefetch(s);
__builtin_prefetch(s + src_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+ d26s16, d27s16, q0s16);
__builtin_prefetch(s + src_stride * 2);
__builtin_prefetch(s + src_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
d2u16 = vqrshrun_n_s32(q1s32, 7);
d3u16 = vqrshrun_n_s32(q2s32, 7);
diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c
index dc58a332f..abc2511ea 100644
--- a/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -13,34 +13,32 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_convolve_avg_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
+void vpx_convolve_avg_neon(const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride, int w,
+ int h) {
uint8_t *d;
uint8x8_t d0u8, d1u8, d2u8, d3u8;
uint32x2_t d0u32, d2u32;
uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
d = dst;
if (w > 32) { // avg64
for (; h > 0; h -= 1) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
q10u8 = vld1q_u8(d + 32);
q11u8 = vld1q_u8(d + 48);
d += dst_stride;
@@ -133,8 +131,7 @@ void vpx_convolve_avg_neon(
d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
d += dst_stride;
- d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
- vreinterpret_u8_u32(d2u32));
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
d0u32 = vreinterpret_u32_u8(d0u8);
vst1_lane_u32((uint32_t *)dst, d0u32, 0);
diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c
index d8fb97a86..fec189e0e 100644
--- a/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -13,21 +13,19 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_convolve_copy_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
+void vpx_convolve_copy_neon(const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride, int w,
+ int h) {
uint8x8_t d0u8, d2u8;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
if (w > 32) { // copy64
for (; h > 0; h--) {
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 1506ce620..c2d5895b7 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -14,10 +14,9 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
-void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
@@ -35,23 +34,20 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
* the temp buffer which has lots of extra room and is subsequently discarded
* this is safe if somewhat less than ideal.
*/
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
- temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, intermediate_height);
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w,
+ intermediate_height);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_convolve8_vert_neon(temp + 64 * 3, 64,
- dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
}
void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
int intermediate_height = h + 7;
@@ -61,12 +57,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
- temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, intermediate_height);
- vpx_convolve8_avg_vert_neon(temp + 64 * 3,
- 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w,
+ intermediate_height);
+ vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
}
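
The wrapper above is a conventional separable two-pass convolution: with 8 taps the vertical pass reads 3 rows above and 4 rows below each output row, so the horizontal pass is run over h + 7 rows starting 3 rows early into a 64-wide temp buffer, and the vertical pass then starts 3 rows into that buffer. A small check of just that sizing arithmetic, a sketch under those assumptions rather than library code:

#include <assert.h>
#include <stddef.h>

int main(void) {
  const int max_w = 64, max_h = 64, taps = 8;
  /* rows above and below needed by an 8-tap vertical filter */
  const int rows_above = taps / 2 - 1;                             /* 3 */
  const int rows_below = taps / 2;                                 /* 4 */
  const int intermediate_height = max_h + rows_above + rows_below; /* 71 */
  /* the temp buffer is declared as 64 * 72, one spare row beyond 71 */
  const size_t temp_size = (size_t)max_w * (max_h + taps);
  assert(intermediate_height == max_h + 7);
  assert(temp_size >= (size_t)max_w * intermediate_height);
  /* the vertical pass starts at temp + 64 * 3, skipping the rows above */
  assert(rows_above * max_w == 64 * 3);
  return 0;
}
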
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index a8c996663..4d9abb8de 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -15,8 +15,9 @@
unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
int i, j;
int sum = 0;
- for (i = 0; i < 8; ++i, s+=p)
- for (j = 0; j < 8; sum += s[j], ++j) {}
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
return (sum + 32) >> 6;
}
@@ -24,8 +25,9 @@ unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
int i, j;
int sum = 0;
- for (i = 0; i < 4; ++i, s+=p)
- for (j = 0; j < 4; sum += s[j], ++j) {}
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
return (sum + 8) >> 4;
}
@@ -80,8 +82,8 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
for (idx = 0; idx < 8; ++idx) {
hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit
// dynamic range [-2040, 2040]
- coeff += 8; // coeff: 15 bit
- // dynamic range [-16320, 16320]
+ coeff += 8; // coeff: 15 bit
+ // dynamic range [-16320, 16320]
++tmp_buf;
}
}
@@ -92,8 +94,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int idx;
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
- const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
- + (idx & 0x01) * 8;
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
}
@@ -109,8 +111,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
int16_t b3 = (a2 - a3) >> 1;
- coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
- coeff[64] = b1 + b3;
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[64] = b1 + b3;
coeff[128] = b0 - b2;
coeff[192] = b1 - b3;
@@ -123,8 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int vpx_satd_c(const int16_t *coeff, int length) {
int i;
int satd = 0;
- for (i = 0; i < length; ++i)
- satd += abs(coeff[i]);
+ for (i = 0; i < length; ++i) satd += abs(coeff[i]);
// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
return satd;
@@ -140,8 +141,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
int i;
hbuf[idx] = 0;
// hbuf[idx]: 14 bit, dynamic range [0, 16320].
- for (i = 0; i < height; ++i)
- hbuf[idx] += ref[i * ref_stride];
+ for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
// hbuf[idx]: 9 bit, dynamic range [0, 510].
hbuf[idx] /= norm_factor;
++ref;
@@ -153,16 +153,14 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
int idx;
int16_t sum = 0;
// sum: 14 bit, dynamic range [0, 16320]
- for (idx = 0; idx < width; ++idx)
- sum += ref[idx];
+ for (idx = 0; idx < width; ++idx) sum += ref[idx];
return sum;
}
// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4}
-int vpx_vector_var_c(const int16_t *ref, const int16_t *src,
- const int bwl) {
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
int i;
int width = 4 << bwl;
int sse = 0, mean = 0, var;
@@ -185,7 +183,7 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
*max = 0;
for (i = 0; i < 8; ++i, s += p, d += dp) {
for (j = 0; j < 8; ++j) {
- int diff = abs(s[j]-d[j]);
+ int diff = abs(s[j] - d[j]);
*min = diff < *min ? diff : *min;
*max = diff > *max ? diff : *max;
}
@@ -196,9 +194,10 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
int sum = 0;
- const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
- for (i = 0; i < 8; ++i, s+=p)
- for (j = 0; j < 8; sum += s[j], ++j) {}
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
return (sum + 32) >> 6;
}
@@ -206,9 +205,10 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
int i, j;
int sum = 0;
- const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
- for (i = 0; i < 4; ++i, s+=p)
- for (j = 0; j < 4; sum += s[j], ++j) {}
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
return (sum + 8) >> 4;
}
@@ -216,18 +216,16 @@ unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
int dp, int *min, int *max) {
int i, j;
- const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
- const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
*min = 255;
*max = 0;
for (i = 0; i < 8; ++i, s += p, d += dp) {
for (j = 0; j < 8; ++j) {
- int diff = abs(s[j]-d[j]);
+ int diff = abs(s[j] - d[j]);
*min = diff < *min ? diff : *min;
*max = diff > *max ? diff : *max;
}
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-
-
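
The "(sum + 32) >> 6" and "(sum + 8) >> 4" returns above are round-to-nearest averages over 64 and 16 pixels respectively. A quick, purely illustrative check of that rounding identity:

#include <assert.h>

int main(void) {
  int sum;
  /* 8x8: 64 pixels of at most 255, so sum <= 16320 */
  for (sum = 0; sum <= 64 * 255; ++sum) {
    assert(((sum + 32) >> 6) == (int)(sum / 64.0 + 0.5)); /* round(sum / 64) */
  }
  /* 4x4: 16 pixels, sum <= 4080 */
  for (sum = 0; sum <= 16 * 255; ++sum) {
    assert(((sum + 8) >> 4) == (int)(sum / 16.0 + 0.5)); /* round(sum / 16) */
  }
  return 0;
}
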
diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c
index 8140e78e7..90cbbba53 100644
--- a/vpx_dsp/bitreader.c
+++ b/vpx_dsp/bitreader.c
@@ -18,11 +18,8 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_util/endian_inl.h"
-int vpx_reader_init(vpx_reader *r,
- const uint8_t *buffer,
- size_t size,
- vpx_decrypt_cb decrypt_cb,
- void *decrypt_state) {
+int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
+ vpx_decrypt_cb decrypt_cb, void *decrypt_state) {
if (size && !buffer) {
return 1;
} else {
@@ -55,19 +52,19 @@ void vpx_reader_fill(vpx_reader *r) {
buffer_start = r->clear_buffer;
}
if (bits_left > BD_VALUE_SIZE) {
- const int bits = (shift & 0xfffffff8) + CHAR_BIT;
- BD_VALUE nv;
- BD_VALUE big_endian_values;
- memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+ const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+ BD_VALUE nv;
+ BD_VALUE big_endian_values;
+ memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
#if SIZE_MAX == 0xffffffffffffffffULL
- big_endian_values = HToBE64(big_endian_values);
+ big_endian_values = HToBE64(big_endian_values);
#else
- big_endian_values = HToBE32(big_endian_values);
+ big_endian_values = HToBE32(big_endian_values);
#endif
- nv = big_endian_values >> (BD_VALUE_SIZE - bits);
- count += bits;
- buffer += (bits >> 3);
- value = r->value | (nv << (shift & 0x7));
+ nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+ count += bits;
+ buffer += (bits >> 3);
+ value = r->value | (nv << (shift & 0x7));
} else {
const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
int loop_end = 0;
diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h
index 9a441b410..6ee2a5863 100644
--- a/vpx_dsp/bitreader.h
+++ b/vpx_dsp/bitreader.h
@@ -45,11 +45,8 @@ typedef struct {
uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
} vpx_reader;
-int vpx_reader_init(vpx_reader *r,
- const uint8_t *buffer,
- size_t size,
- vpx_decrypt_cb decrypt_cb,
- void *decrypt_state);
+int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
+ vpx_decrypt_cb decrypt_cb, void *decrypt_state);
void vpx_reader_fill(vpx_reader *r);
@@ -81,8 +78,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) {
unsigned int range;
unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
- if (r->count < 0)
- vpx_reader_fill(r);
+ if (r->count < 0) vpx_reader_fill(r);
value = r->value;
count = r->count;
@@ -117,8 +113,7 @@ static INLINE int vpx_read_bit(vpx_reader *r) {
static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
int literal = 0, bit;
- for (bit = bits - 1; bit >= 0; bit--)
- literal |= vpx_read_bit(r) << bit;
+ for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit;
return literal;
}
@@ -127,8 +122,7 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
const vpx_prob *probs) {
vpx_tree_index i = 0;
- while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0)
- continue;
+ while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue;
return -i;
}
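
For context on the vpx_read() hot path above: prob is a probability in 1/256 units and split is, roughly, the share of the current range assigned to a zero bit, with the reader only refilling when count goes negative. A worked example of the split formula copied from the code above; the surrounding program is just illustration:

#include <limits.h>
#include <stdio.h>

/* Worked example of the split computation in vpx_read() above:
 * split = (range * prob + (256 - prob)) >> CHAR_BIT. */
int main(void) {
  const unsigned int range = 255; /* the range right after initialization */
  unsigned int prob;
  for (prob = 1; prob <= 255; prob += 127) {
    const unsigned int split = (range * prob + (256 - prob)) >> CHAR_BIT;
    printf("prob = %3u/256 -> split = %3u of range %u\n", prob, split, range);
  }
  return 0;
}
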
diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c
index d7b55cf9f..e99fffb60 100644
--- a/vpx_dsp/bitreader_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c
@@ -30,23 +30,20 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
int value = 0, bit;
- for (bit = bits - 1; bit >= 0; bit--)
- value |= vpx_rb_read_bit(rb) << bit;
+ for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit;
return value;
}
-int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
- int bits) {
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
const int value = vpx_rb_read_literal(rb, bits);
return vpx_rb_read_bit(rb) ? -value : value;
}
-int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
- int bits) {
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
#if CONFIG_MISC_FIXES
const int nbits = sizeof(unsigned) * 8 - bits - 1;
const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
- return ((int) value) >> nbits;
+ return ((int)value) >> nbits;
#else
return vpx_rb_read_signed_literal(rb, bits);
#endif
diff --git a/vpx_dsp/bitwriter.c b/vpx_dsp/bitwriter.c
index 5b232e346..81e28b309 100644
--- a/vpx_dsp/bitwriter.c
+++ b/vpx_dsp/bitwriter.c
@@ -14,21 +14,18 @@
void vpx_start_encode(vpx_writer *br, uint8_t *source) {
br->lowvalue = 0;
- br->range = 255;
- br->count = -24;
- br->buffer = source;
- br->pos = 0;
+ br->range = 255;
+ br->count = -24;
+ br->buffer = source;
+ br->pos = 0;
vpx_write_bit(br, 0);
}
void vpx_stop_encode(vpx_writer *br) {
int i;
- for (i = 0; i < 32; i++)
- vpx_write_bit(br, 0);
+ for (i = 0; i < 32; i++) vpx_write_bit(br, 0);
// Ensure there's no ambiguous collision with any index marker bytes
- if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
- br->buffer[br->pos++] = 0;
+ if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
}
-
diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h
index d904997af..41040cf93 100644
--- a/vpx_dsp/bitwriter.h
+++ b/vpx_dsp/bitwriter.h
@@ -85,8 +85,7 @@ static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
int bit;
- for (bit = bits - 1; bit >= 0; bit--)
- vpx_write_bit(w, 1 & (data >> bit));
+ for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit));
}
#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)
diff --git a/vpx_dsp/bitwriter_buffer.c b/vpx_dsp/bitwriter_buffer.c
index 6182a7222..1043cdc78 100644
--- a/vpx_dsp/bitwriter_buffer.c
+++ b/vpx_dsp/bitwriter_buffer.c
@@ -22,7 +22,7 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
const int off = (int)wb->bit_offset;
const int p = off / CHAR_BIT;
const int q = CHAR_BIT - 1 - off % CHAR_BIT;
- if (q == CHAR_BIT -1) {
+ if (q == CHAR_BIT - 1) {
wb->bit_buffer[p] = bit << q;
} else {
wb->bit_buffer[p] &= ~(1 << q);
@@ -33,12 +33,11 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
int bit;
- for (bit = bits - 1; bit >= 0; bit--)
- vpx_wb_write_bit(wb, (data >> bit) & 1);
+ for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1);
}
-void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb,
- int data, int bits) {
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
+ int bits) {
#if CONFIG_MISC_FIXES
vpx_wb_write_literal(wb, data, bits + 1);
#else
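
vpx_wb_write_bit above packs bits MSB-first: bit offset off lands in byte off / CHAR_BIT at bit position CHAR_BIT - 1 - off % CHAR_BIT, and vpx_wb_write_literal emits a value from its top bit down. A minimal model of that packing using a plain pre-zeroed buffer; the helper name is invented and is not the library API:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* MSB-first bit packing: byte index off / CHAR_BIT, bit position
 * CHAR_BIT - 1 - off % CHAR_BIT, mirroring vpx_wb_write_bit above. */
static void put_bit(uint8_t *buf, int off, int bit) {
  const int p = off / CHAR_BIT;
  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
  buf[p] = (uint8_t)((buf[p] & ~(1 << q)) | ((bit & 1) << q));
}

int main(void) {
  uint8_t buf[2] = { 0, 0 };
  const int data = 0x2d, bits = 6; /* write 101101b, top bit first */
  int bit, off = 0;
  for (bit = bits - 1; bit >= 0; bit--) put_bit(buf, off++, (data >> bit) & 1);
  printf("0x%02x 0x%02x\n", buf[0], buf[1]); /* prints 0xb4 0x00 */
  return 0;
}
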
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
index 2b1b7e30e..d3e047221 100644
--- a/vpx_dsp/deblock.c
+++ b/vpx_dsp/deblock.c
@@ -10,26 +10,32 @@
#include <stdlib.h>
#include "vpx/vpx_integer.h"
-const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
- 14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
- 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
- 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
- 8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
- 4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
- 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
- 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
- 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
- 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
- 8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
- 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
- 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
- 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
- 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
- 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
- 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
- 13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };
+const int16_t vpx_rv[] = {
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
+ 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
+ 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
+ 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3,
+ 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0,
+ 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5,
+ 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7,
+ 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1,
+ 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9,
+ 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2,
+ 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6,
+ 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9,
+ 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2,
+ 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3,
+ 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7,
+ 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0,
+ 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12,
+ 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0,
+ 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12,
+ 3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6,
+ 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13,
+ 9, 10, 13,
+};
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
unsigned char *dst_ptr,
@@ -55,8 +61,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
v = p_src[col];
- if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
- && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+ if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
+ (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1;
@@ -77,10 +83,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
for (col = 0; col < cols; col++) {
v = p_src[col];
- if ((abs(v - p_src[col - 2]) < f[col])
- && (abs(v - p_src[col - 1]) < f[col])
- && (abs(v - p_src[col + 1]) < f[col])
- && (abs(v - p_src[col + 2]) < f[col])) {
+ if ((abs(v - p_src[col - 2]) < f[col]) &&
+ (abs(v - p_src[col - 1]) < f[col]) &&
+ (abs(v - p_src[col + 1]) < f[col]) &&
+ (abs(v - p_src[col + 2]) < f[col])) {
unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@@ -90,8 +96,7 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
d[col & 3] = v;
- if (col >= 2)
- p_dst[col - 2] = d[(col - 2) & 3];
+ if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
}
/* handle the last two pixels */
@@ -115,14 +120,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
int sumsq = 0;
int sum = 0;
- for (i = -8; i < 0; i++)
- s[i] = s[0];
+ for (i = -8; i < 0; i++) s[i] = s[0];
/* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead...
*/
- for (i = 0; i < 17; i++)
- s[i + cols] = s[cols - 1];
+ for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];
for (i = -8; i <= 6; i++) {
sumsq += s[i] * s[i];
@@ -162,14 +165,12 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
unsigned char d[16];
const int16_t *rv2 = rv3 + ((c * 17) & 127);
- for (i = -8; i < 0; i++)
- s[i * pitch] = s[0];
+ for (i = -8; i < 0; i++) s[i * pitch] = s[0];
/* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead...
*/
- for (i = 0; i < 17; i++)
- s[(i + rows) * pitch] = s[(rows - 1) * pitch];
+ for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];
for (i = -8; i <= 6; i++) {
sumsq += s[i * pitch] * s[i * pitch];
@@ -184,20 +185,18 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
if (sumsq * 15 - sum * sum < flimit) {
d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
}
- if (r >= 8)
- s[-8 * pitch] = d[(r - 8) & 15];
+ if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
s += pitch;
}
}
}
#if CONFIG_POSTPROC
-static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
- int q) {
+static void vpx_de_mblock(YV12_BUFFER_CONFIG *post, int q) {
vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
- post->y_width, q2mbl(q));
+ post->y_width, q2mbl(q));
vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
- post->y_width, q2mbl(q));
+ post->y_width, q2mbl(q));
}
#endif
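
The vpx_mbpost_proc_* routines above gate their smoothing on a 15-sample window test, only filtering where sumsq * 15 - sum * sum stays below the threshold derived from q2mbl(q). A minimal scalar sketch of that flatness test on one padded row; the helper name and the flimit value are made up for illustration:

#include <stdio.h>

/* Flatness gate modeled on the "sumsq * 15 - sum * sum < flimit" test
 * above: sum and sum-of-squares over a 15-sample window around column i. */
static int window_is_flat(const unsigned char *s, int i, int flimit) {
  int sum = 0, sumsq = 0, k;
  for (k = -8; k <= 6; ++k) {
    sum += s[i + k];
    sumsq += s[i + k] * s[i + k];
  }
  return sumsq * 15 - sum * sum < flimit;
}

int main(void) {
  unsigned char row[32];
  int i;
  /* a flat run followed by a hard edge, similar to the s[-8..] and
   * s[cols..] padding the real code sets up before filtering */
  for (i = 0; i < 32; ++i) row[i] = (i < 20) ? 80 : 200;
  for (i = 8; i < 24; ++i)
    printf("col %2d: %s\n", i, window_is_flat(row, i, 500) ? "smooth" : "keep");
  return 0;
}
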
diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c
index 7d9089171..4d5eb5a6f 100644
--- a/vpx_dsp/fastssim.c
+++ b/vpx_dsp/fastssim.c
@@ -55,12 +55,12 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
int l;
lw = (_w + 1) >> 1;
lh = (_h + 1) >> 1;
- data_size = _nlevels * sizeof(fs_level)
- + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ data_size =
+ _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
for (l = 0; l < _nlevels; l++) {
size_t im_size;
size_t level_size;
- im_size = lw * (size_t) lh;
+ im_size = lw * (size_t)lh;
level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
level_size += sizeof(*_ctx->level[l].ssim) - 1;
level_size /= sizeof(*_ctx->level[l].ssim);
@@ -70,8 +70,8 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
lw = (lw + 1) >> 1;
lh = (lh + 1) >> 1;
}
- data = (unsigned char *) malloc(data_size);
- _ctx->level = (fs_level *) data;
+ data = (unsigned char *)malloc(data_size);
+ _ctx->level = (fs_level *)data;
_ctx->nlevels = _nlevels;
data += _nlevels * sizeof(*_ctx->level);
lw = (_w + 1) >> 1;
@@ -81,7 +81,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
size_t level_size;
_ctx->level[l].w = lw;
_ctx->level[l].h = lh;
- im_size = lw * (size_t) lh;
+ im_size = lw * (size_t)lh;
level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
level_size += sizeof(*_ctx->level[l].ssim) - 1;
level_size /= sizeof(*_ctx->level[l].ssim);
@@ -89,17 +89,15 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
_ctx->level[l].im1 = (uint32_t *)data;
_ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
data += level_size;
- _ctx->level[l].ssim = (double *) data;
+ _ctx->level[l].ssim = (double *)data;
data += im_size * sizeof(*_ctx->level[l].ssim);
lw = (lw + 1) >> 1;
lh = (lh + 1) >> 1;
}
- _ctx->col_buf = (unsigned *) data;
+ _ctx->col_buf = (unsigned *)data;
}
-static void fs_ctx_clear(fs_ctx *_ctx) {
- free(_ctx->level);
-}
+static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
static void fs_downsample_level(fs_ctx *_ctx, int _l) {
const uint32_t *src1;
@@ -130,18 +128,18 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) {
int i1;
i0 = 2 * i;
i1 = FS_MINI(i0 + 1, w2);
- dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1]
- + src1[j1offs + i0] + src1[j1offs + i1];
- dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1]
- + src2[j1offs + i0] + src2[j1offs + i1];
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1];
}
}
}
static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
int _s1ystride, const uint8_t *_src2,
- int _s2ystride, int _w, int _h,
- uint32_t bd, uint32_t shift) {
+ int _s2ystride, int _w, int _h, uint32_t bd,
+ uint32_t shift) {
uint32_t *dst1;
uint32_t *dst2;
int w;
@@ -163,23 +161,23 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
i0 = 2 * i;
i1 = FS_MINI(i0 + 1, _w);
if (bd == 8 && shift == 0) {
- dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
- + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
- + _src1[j1 * _s1ystride + i1];
- dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
- + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
- + _src2[j1 * _s2ystride + i1];
+ dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
} else {
- uint16_t * src1s = CONVERT_TO_SHORTPTR(_src1);
- uint16_t * src2s = CONVERT_TO_SHORTPTR(_src2);
- dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift)
- + (src1s[j0 * _s1ystride + i1] >> shift)
- + (src1s[j1 * _s1ystride + i0] >> shift)
- + (src1s[j1 * _s1ystride + i1] >> shift);
- dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift)
- + (src2s[j0 * _s2ystride + i1] >> shift)
- + (src2s[j1 * _s2ystride + i0] >> shift)
- + (src2s[j1 * _s2ystride + i1] >> shift);
+ uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+ uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+ dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift);
+ dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift);
}
}
}
@@ -200,10 +198,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
int j;
double ssim_c1 = SSIM_C1;
#if CONFIG_VP9_HIGHBITDEPTH
- if (bit_depth == 10)
- ssim_c1 = SSIM_C1_10;
- if (bit_depth == 12)
- ssim_c1 = SSIM_C1_12;
+ if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
+ if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
#else
assert(bit_depth == 8);
#endif
@@ -213,19 +209,15 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
col_sums_y = col_sums_x + w;
im1 = _ctx->level[_l].im1;
im2 = _ctx->level[_l].im2;
- for (i = 0; i < w; i++)
- col_sums_x[i] = 5 * im1[i];
- for (i = 0; i < w; i++)
- col_sums_y[i] = 5 * im2[i];
+ for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
for (j = 1; j < 4; j++) {
j1offs = FS_MINI(j, h - 1) * w;
- for (i = 0; i < w; i++)
- col_sums_x[i] += im1[j1offs + i];
- for (i = 0; i < w; i++)
- col_sums_y[i] += im2[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
}
ssim = _ctx->level[_l].ssim;
- c1 = (double) (ssim_c1 * 4096 * (1 << 4 * _l));
+ c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
for (j = 0; j < h; j++) {
unsigned mux;
unsigned muy;
@@ -239,8 +231,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
muy += col_sums_y[i1];
}
for (i = 0; i < w; i++) {
- ssim[j * w + i] *= (2 * mux * (double) muy + c1)
- / (mux * (double) mux + muy * (double) muy + c1);
+ ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
+ (mux * (double)mux + muy * (double)muy + c1);
if (i + 1 < w) {
i0 = FS_MAXI(0, i - 4);
i1 = FS_MINI(i + 4, w - 1);
@@ -250,78 +242,68 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
}
if (j + 1 < h) {
j0offs = FS_MAXI(0, j - 4) * w;
- for (i = 0; i < w; i++)
- col_sums_x[i] -= im1[j0offs + i];
- for (i = 0; i < w; i++)
- col_sums_y[i] -= im2[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
j1offs = FS_MINI(j + 4, h - 1) * w;
- for (i = 0; i < w; i++)
- col_sums_x[i] += im1[j1offs + i];
- for (i = 0; i < w; i++)
- col_sums_y[i] += im2[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
}
}
}
-#define FS_COL_SET(_col, _joffs, _ioffs) \
- do { \
- unsigned gx; \
- unsigned gy; \
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- col_sums_gx2[(_col)] = gx * (double)gx; \
- col_sums_gy2[(_col)] = gy * (double)gy; \
- col_sums_gxgy[(_col)] = gx * (double)gy; \
- } \
- while (0)
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } while (0)
-#define FS_COL_ADD(_col, _joffs, _ioffs) \
- do { \
- unsigned gx; \
- unsigned gy; \
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- col_sums_gx2[(_col)] += gx * (double)gx; \
- col_sums_gy2[(_col)] += gy * (double)gy; \
- col_sums_gxgy[(_col)] += gx * (double)gy; \
- } \
- while (0)
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } while (0)
-#define FS_COL_SUB(_col, _joffs, _ioffs) \
- do { \
- unsigned gx; \
- unsigned gy; \
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- col_sums_gx2[(_col)] -= gx * (double)gx; \
- col_sums_gy2[(_col)] -= gy * (double)gy; \
- col_sums_gxgy[(_col)] -= gx * (double)gy; \
- } \
- while (0)
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } while (0)
-#define FS_COL_COPY(_col1, _col2) \
- do { \
- col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
- col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
- } \
- while (0)
+ } while (0)
-#define FS_COL_HALVE(_col1, _col2) \
- do { \
- col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
- col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
- } \
- while (0)
+ } while (0)
-#define FS_COL_DOUBLE(_col1, _col2) \
- do { \
- col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
- col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
- } \
- while (0)
+ } while (0)
static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
uint32_t *im1;
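
The FS_COL_* macros above only change indentation and where the while (0) lands; the do { ... } while (0) wrapper they keep is the usual trick for making a multi-statement macro behave as a single statement. A self-contained illustration with a hypothetical macro, not taken from libvpx:

/* Why the FS_COL_* macros keep the do { ... } while (0) wrapper: the
 * expansion is exactly one statement, so it nests under if/else and
 * takes a trailing semicolon like an ordinary call.  SWAP_INT is a
 * made-up example. */
#define SWAP_INT(a, b) \
  do {                 \
    int tmp_ = (a);    \
    (a) = (b);         \
    (b) = tmp_;        \
  } while (0)

static int sort2(int *lo, int *hi) {
  if (*lo > *hi)
    SWAP_INT(*lo, *hi); /* a bare { ... } block here would leave the
                           following else dangling */
  else
    return 0;
  return 1;
}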
@@ -340,10 +322,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
int j;
double ssim_c2 = SSIM_C2;
#if CONFIG_VP9_HIGHBITDEPTH
- if (bit_depth == 10)
- ssim_c2 = SSIM_C2_10;
- if (bit_depth == 12)
- ssim_c2 = SSIM_C2_12;
+ if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
+ if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
#else
assert(bit_depth == 8);
#endif
@@ -398,14 +378,11 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
double mugy2;
double mugxgy;
mugx2 = col_sums_gx2[0];
- for (k = 1; k < 8; k++)
- mugx2 += col_sums_gx2[k];
+ for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
mugy2 = col_sums_gy2[0];
- for (k = 1; k < 8; k++)
- mugy2 += col_sums_gy2[k];
+ for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
mugxgy = col_sums_gxgy[0];
- for (k = 1; k < 8; k++)
- mugxgy += col_sums_gxgy[k];
+ for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
if (i + 1 < w) {
FS_COL_SET(0, -1, 1);
@@ -440,8 +417,9 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}.
We drop the finest scale and renormalize the rest to sum to 1.*/
-static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625,
- 0.3141326904296875, 0.2473602294921875, 0.1395416259765625};
+static const double FS_WEIGHTS[FS_NLEVELS] = {
+ 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
+};
static double fs_average(fs_ctx *_ctx, int _l) {
double *ssim;
@@ -455,28 +433,26 @@ static double fs_average(fs_ctx *_ctx, int _l) {
ssim = _ctx->level[_l].ssim;
ret = 0;
for (j = 0; j < h; j++)
- for (i = 0; i < w; i++)
- ret += ssim[j * w + i];
+ for (i = 0; i < w; i++) ret += ssim[j * w + i];
return pow(ret / (w * h), FS_WEIGHTS[_l]);
}
static double convert_ssim_db(double _ssim, double _weight) {
assert(_weight >= _ssim);
- if ((_weight - _ssim) < 1e-10)
- return MAX_SSIM_DB;
+ if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
return 10 * (log10(_weight) - log10(_weight - _ssim));
}
-static double calc_ssim(const uint8_t *_src, int _systride,
- const uint8_t *_dst, int _dystride,
- int _w, int _h, uint32_t _bd, uint32_t _shift) {
+static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
+ int _dystride, int _w, int _h, uint32_t _bd,
+ uint32_t _shift) {
fs_ctx ctx;
double ret;
int l;
ret = 1;
fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
- fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride,
- _w, _h, _bd, _shift);
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
+ _shift);
for (l = 0; l < FS_NLEVELS - 1; l++) {
fs_calc_structure(&ctx, l, _bd);
ret *= fs_average(&ctx, l);
@@ -490,9 +466,9 @@ static double calc_ssim(const uint8_t *_src, int _systride,
}
double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *ssim_y, double *ssim_u, double *ssim_v,
- uint32_t bd, uint32_t in_bd) {
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd) {
double ssimv;
uint32_t bd_shift = 0;
vpx_clear_system_state();
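
Taken together, the fastssim.c pieces reflowed above combine the per-level averages as a weighted geometric mean (fs_average raises each level's mean SSIM to FS_WEIGHTS[l], and calc_ssim multiplies the levels), which convert_ssim_db then maps to decibels. A hedged restatement of that arithmetic with illustrative names; the 1e-10 guard is copied from the code above, and the dB cap merely stands in for MAX_SSIM_DB, which is defined elsewhere in libvpx.

#include <math.h>

#define MAX_SSIM_DB_SKETCH 100.0 /* illustrative stand-in for MAX_SSIM_DB */

/* level_means[] stands in for the per-level mean SSIM values that
 * fs_average() computes before applying the exponent. */
static double fastssim_combine_sketch(const double *level_means,
                                      const double *weights, int nlevels) {
  double ssim = 1.0;
  int l;
  for (l = 0; l < nlevels; l++) ssim *= pow(level_means[l], weights[l]);
  return ssim;
}

static double ssim_to_db_sketch(double ssim, double weight) {
  /* Same form as convert_ssim_db() above. */
  if (weight - ssim < 1e-10) return MAX_SSIM_DB_SKETCH;
  return 10 * (log10(weight) - log10(weight - ssim));
}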
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index 4c0d5db83..4e7d4053e 100644
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c
@@ -72,8 +72,7 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
{
int i, j;
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
}
}
}
@@ -82,8 +81,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 4; ++r)
- for (c = 0; c < 4; ++c)
- sum += input[r * stride + c];
+ for (c = 0; c < 4; ++c) sum += input[r * stride + c];
output[0] = sum << 1;
}
@@ -133,8 +131,8 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
- t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
- t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
output[0] = (tran_low_t)fdct_round_shift(t0);
output[2] = (tran_low_t)fdct_round_shift(t2);
output[4] = (tran_low_t)fdct_round_shift(t1);
@@ -153,24 +151,23 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
x3 = s7 + t3;
// Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
output[1] = (tran_low_t)fdct_round_shift(t0);
output[3] = (tran_low_t)fdct_round_shift(t2);
output[5] = (tran_low_t)fdct_round_shift(t1);
output[7] = (tran_low_t)fdct_round_shift(t3);
output += 8;
}
- in = intermediate;
+ in = intermediate;
output = final_output;
}
// Rows
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- final_output[j + i * 8] /= 2;
+ for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
}
}
@@ -178,8 +175,7 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 8; ++r)
- for (c = 0; c < 8; ++c)
- sum += input[r * stride + c];
+ for (c = 0; c < 8; ++c) sum += input[r * stride + c];
output[0] = sum;
}
@@ -214,11 +210,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
- input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
- input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
+ input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4;
+ input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4;
// Calculate input for the next 8 results.
- step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
- step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+ step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4;
+ step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4;
step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
@@ -233,11 +229,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
- input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
- input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+ input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2);
+ input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2);
// Calculate input for the next 8 results.
- step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
- step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+ step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2);
+ step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2);
step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
@@ -268,7 +264,7 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
- t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
out[0] = (tran_low_t)fdct_round_shift(t0);
out[4] = (tran_low_t)fdct_round_shift(t2);
@@ -288,10 +284,10 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
x3 = s7 + t3;
// Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
out[2] = (tran_low_t)fdct_round_shift(t0);
out[6] = (tran_low_t)fdct_round_shift(t2);
out[10] = (tran_low_t)fdct_round_shift(t1);
@@ -318,12 +314,12 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
step3[6] = step1[6] + step2[5];
step3[7] = step1[7] + step2[4];
// step 4
- temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
- temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5
@@ -336,20 +332,20 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// step 6
- temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
out[5] = (tran_low_t)fdct_round_shift(temp1);
out[13] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
out[3] = (tran_low_t)fdct_round_shift(temp1);
out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
out[7] = (tran_low_t)fdct_round_shift(temp1);
out[15] = (tran_low_t)fdct_round_shift(temp2);
}
@@ -368,8 +364,7 @@ void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
int sum = 0;
for (r = 0; r < 16; ++r)
- for (c = 0; c < 16; ++c)
- sum += input[r * stride + c];
+ for (c = 0; c < 16; ++c) sum += input[r * stride + c];
output[0] = (tran_low_t)(sum >> 1);
}
@@ -675,36 +670,36 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
step[31] = output[31] + output[30];
// Final stage --- outputs indices are bit-reversed.
- output[0] = step[0];
+ output[0] = step[0];
output[16] = step[1];
- output[8] = step[2];
+ output[8] = step[2];
output[24] = step[3];
- output[4] = step[4];
+ output[4] = step[4];
output[20] = step[5];
output[12] = step[6];
output[28] = step[7];
- output[2] = step[8];
+ output[2] = step[8];
output[18] = step[9];
output[10] = step[10];
output[26] = step[11];
- output[6] = step[12];
+ output[6] = step[12];
output[22] = step[13];
output[14] = step[14];
output[30] = step[15];
- output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+ output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
- output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+ output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
- output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+ output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
- output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+ output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
- output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+ output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
@@ -717,8 +712,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
// Columns
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * stride + i] * 4;
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -727,8 +721,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j)
- temp_in[j] = output[j + i * 32];
+ for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] =
@@ -746,8 +739,7 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
// Columns
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * stride + i] * 4;
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
@@ -759,11 +751,9 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j)
- temp_in[j] = output[j + i * 32];
+ for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
vpx_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] = (tran_low_t)temp_out[j];
+ for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
}
}
@@ -771,8 +761,7 @@ void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
int sum = 0;
for (r = 0; r < 32; ++r)
- for (c = 0; c < 32; ++c)
- sum += input[r * stride + c];
+ for (c = 0; c < 32; ++c) sum += input[r * stride + c];
output[0] = (tran_low_t)(sum >> 3);
}
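
The vpx_fdct*_1_c hunks above all collapse to the same shape once reflowed: sum the whole block, then apply a per-size scale to the lone DC coefficient (<<1 for 4x4, none for 8x8, >>1 for 16x16, >>3 for 32x32). A generic sketch under those assumptions; fdct_dc_only() is an illustrative helper, not a libvpx symbol, and int32_t stands in for tran_high_t/tran_low_t.

#include <stdint.h>

/* DC-only forward transform: log2_scale is +1 for 4x4, 0 for 8x8,
 * -1 for 16x16 and -3 for 32x32, matching the functions above. */
static int32_t fdct_dc_only(const int16_t *input, int stride, int n,
                            int log2_scale) {
  int r, c;
  int32_t sum = 0;
  for (r = 0; r < n; ++r)
    for (c = 0; c < n; ++c) sum += input[r * stride + c];
  return log2_scale >= 0 ? (sum << log2_scale) : (sum >> -log2_scale);
}

/* e.g. output[0] of vpx_fdct16x16_1_c(in, out, stride) corresponds to
 * fdct_dc_only(in, stride, 16, -1). */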
diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c
index cc4a74bd2..eca17a983 100644
--- a/vpx_dsp/intrapred.c
+++ b/vpx_dsp/intrapred.c
@@ -14,17 +14,16 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
-#define DST(x, y) dst[(x) + (y) * stride]
+#define DST(x, y) dst[(x) + (y)*stride]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
- (void) above;
+ (void)above;
// first column
- for (r = 0; r < bs - 1; ++r)
- dst[r * stride] = AVG2(left[r], left[r + 1]);
+ for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]);
dst[(bs - 1) * stride] = left[bs - 1];
dst++;
@@ -36,8 +35,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
dst++;
// rest of last row
- for (c = 0; c < bs - 2; ++c)
- dst[(bs - 1) * stride + c] = left[bs - 1];
+ for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
for (r = bs - 2; r >= 0; --r)
for (c = 0; c < bs - 2; ++c)
@@ -48,13 +46,13 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
- (void) above;
+ (void)above;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
left[(c >> 1) + r + 2])
- : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
+ : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
}
dst += stride;
}
@@ -82,12 +80,12 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
- (void) left;
+ (void)left;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
above[(r >> 1) + c + 2])
- : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
+ : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
}
dst += stride;
}
@@ -117,7 +115,7 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
- (void) left;
+ (void)left;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
dst[c] = AVG3(above[r + c], above[r + c + 1],
@@ -133,14 +131,12 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
int r, c;
// first row
- for (c = 0; c < bs; c++)
- dst[c] = AVG2(above[c - 1], above[c]);
+ for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
dst += stride;
// second row
dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++)
- dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst += stride;
// the rest of first col
@@ -150,8 +146,7 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// the rest of the block
for (r = 2; r < bs; ++r) {
- for (c = 1; c < bs; c++)
- dst[c] = dst[-2 * stride + c - 1];
+ for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
dst += stride;
}
}
@@ -188,8 +183,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
dst[0] = AVG2(above[-1], left[0]);
- for (r = 1; r < bs; r++)
- dst[r * stride] = AVG2(left[r - 1], left[r]);
+ for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
dst++;
dst[0] = AVG3(left[0], above[-1], above[0]);
@@ -203,8 +197,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
dst += stride;
for (r = 1; r < bs; ++r) {
- for (c = 0; c < bs - 2; c++)
- dst[c] = dst[-stride + c - 2];
+ for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
dst += stride;
}
}
@@ -212,7 +205,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
- (void) left;
+ (void)left;
for (r = 0; r < bs; r++) {
memcpy(dst, above, bs);
@@ -223,7 +216,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
- (void) above;
+ (void)above;
for (r = 0; r < bs; r++) {
memset(dst, left[r], bs);
@@ -246,8 +239,8 @@ static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
- (void) above;
- (void) left;
+ (void)above;
+ (void)left;
for (r = 0; r < bs; r++) {
memset(dst, 128, bs);
@@ -259,10 +252,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above,
const uint8_t *left) {
int i, r, expected_dc, sum = 0;
- (void) above;
+ (void)above;
- for (i = 0; i < bs; i++)
- sum += left[i];
+ for (i = 0; i < bs; i++) sum += left[i];
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
@@ -274,10 +266,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int i, r, expected_dc, sum = 0;
- (void) left;
+ (void)left;
- for (i = 0; i < bs; i++)
- sum += above[i];
+ for (i = 0; i < bs; i++) sum += above[i];
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
@@ -344,14 +335,13 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const int K = left[2];
const int L = left[3];
(void)above;
- DST(0, 0) = AVG2(I, J);
+ DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
- DST(1, 0) = AVG3(I, J, K);
+ DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
- DST(3, 2) = DST(2, 2) =
- DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+ DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -364,17 +354,17 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const int F = above[5];
const int G = above[6];
(void)left;
- DST(0, 0) = AVG2(A, B);
+ DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
- DST(3, 2) = AVG2(E, F); // differs from vp8
+ DST(3, 2) = AVG2(E, F); // differs from vp8
- DST(0, 1) = AVG3(A, B, C);
+ DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
- DST(3, 3) = AVG3(E, F, G); // differs from vp8
+ DST(3, 3) = AVG3(E, F, G); // differs from vp8
}
void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -388,17 +378,17 @@ void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const int G = above[6];
const int H = above[7];
(void)left;
- DST(0, 0) = AVG2(A, B);
+ DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
- DST(3, 2) = AVG3(E, F, G);
+ DST(3, 2) = AVG3(E, F, G);
- DST(0, 1) = AVG3(A, B, C);
+ DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
- DST(3, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(F, G, H);
}
void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -413,13 +403,13 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const int H = above[7];
(void)stride;
(void)left;
- DST(0, 0) = AVG3(A, B, C);
- DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
- DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
- DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
- DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = H; // differs from vp8
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = H; // differs from vp8
}
void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -434,13 +424,13 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const int H = above[7];
(void)stride;
(void)left;
- DST(0, 0) = AVG3(A, B, C);
- DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
- DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
- DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
- DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = AVG3(G, H, H);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
}
void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -456,14 +446,14 @@ void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
- DST(3, 0) = AVG2(C, D);
+ DST(3, 0) = AVG2(C, D);
- DST(0, 3) = AVG3(K, J, I);
- DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
- DST(3, 1) = AVG3(B, C, D);
+ DST(3, 1) = AVG3(B, C, D);
}
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -478,13 +468,13 @@ void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const int C = above[2];
const int D = above[3];
(void)stride;
- DST(0, 3) = AVG3(J, K, L);
- DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
- DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+ DST(0, 3) = AVG3(J, K, L);
+ DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+ DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
- DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
- DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
- DST(3, 0) = AVG3(D, C, B);
+ DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+ DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
}
void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -501,14 +491,14 @@ void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
- DST(0, 3) = AVG2(L, K);
+ DST(0, 3) = AVG2(L, K);
- DST(3, 0) = AVG3(A, B, C);
- DST(2, 0) = AVG3(X, A, B);
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
- DST(1, 3) = AVG3(L, K, J);
+ DST(1, 3) = AVG3(L, K, J);
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -516,8 +506,8 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) above;
- (void) bd;
+ (void)above;
+ (void)bd;
// First column.
for (r = 0; r < bs - 1; ++r) {
@@ -535,8 +525,7 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
dst++;
// Rest of last row.
- for (c = 0; c < bs - 2; ++c)
- dst[(bs - 1) * stride + c] = left[bs - 1];
+ for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
for (r = bs - 2; r >= 0; --r) {
for (c = 0; c < bs - 2; ++c)
@@ -549,31 +538,31 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) above;
- (void) bd;
+ (void)above;
+ (void)bd;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
left[(c >> 1) + r + 2])
- : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
+ : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
}
dst += stride;
}
}
#endif // CONFIG_MISC_FIXES
-static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
+static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) left;
- (void) bd;
+ (void)left;
+ (void)bd;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
above[(r >> 1) + c + 2])
- : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
+ : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
}
dst += stride;
}
@@ -585,13 +574,13 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) left;
- (void) bd;
+ (void)left;
+ (void)bd;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
- dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1],
- above[r + c + 2])
- : above[bs * 2 - 1];
+ dst[c] = r + c + 2 < bs * 2
+ ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
+ : above[bs * 2 - 1];
}
dst += stride;
}
@@ -602,8 +591,8 @@ static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) left;
- (void) bd;
+ (void)left;
+ (void)bd;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
dst[c] = AVG3(above[r + c], above[r + c + 1],
@@ -618,17 +607,15 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) bd;
+ (void)bd;
// first row
- for (c = 0; c < bs; c++)
- dst[c] = AVG2(above[c - 1], above[c]);
+ for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
dst += stride;
// second row
dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++)
- dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst += stride;
// the rest of first col
@@ -638,8 +625,7 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
// the rest of the block
for (r = 2; r < bs; ++r) {
- for (c = 1; c < bs; c++)
- dst[c] = dst[-2 * stride + c - 1];
+ for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
dst += stride;
}
}
@@ -648,10 +634,9 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) bd;
+ (void)bd;
dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++)
- dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst[stride] = AVG3(above[-1], left[0], left[1]);
for (r = 2; r < bs; ++r)
@@ -659,8 +644,7 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
dst += stride;
for (r = 1; r < bs; ++r) {
- for (c = 1; c < bs; c++)
- dst[c] = dst[-stride + c - 1];
+ for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1];
dst += stride;
}
}
@@ -669,10 +653,9 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
- (void) bd;
+ (void)bd;
dst[0] = AVG2(above[-1], left[0]);
- for (r = 1; r < bs; r++)
- dst[r * stride] = AVG2(left[r - 1], left[r]);
+ for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
dst++;
dst[0] = AVG3(left[0], above[-1], above[0]);
@@ -686,42 +669,41 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
dst += stride;
for (r = 1; r < bs; ++r) {
- for (c = 0; c < bs - 2; c++)
- dst[c] = dst[-stride + c - 2];
+ for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
dst += stride;
}
}
-static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
const uint16_t *left, int bd) {
int r;
- (void) left;
- (void) bd;
+ (void)left;
+ (void)bd;
for (r = 0; r < bs; r++) {
memcpy(dst, above, bs * sizeof(uint16_t));
dst += stride;
}
}
-static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
const uint16_t *left, int bd) {
int r;
- (void) above;
- (void) bd;
+ (void)above;
+ (void)bd;
for (r = 0; r < bs; r++) {
vpx_memset16(dst, left[r], bs);
dst += stride;
}
}
-static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
+static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
int ytop_left = above[-1];
- (void) bd;
+ (void)bd;
for (r = 0; r < bs; r++) {
for (c = 0; c < bs; c++)
@@ -734,8 +716,8 @@ static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r;
- (void) above;
- (void) left;
+ (void)above;
+ (void)left;
for (r = 0; r < bs; r++) {
vpx_memset16(dst, 128 << (bd - 8), bs);
@@ -747,11 +729,10 @@ static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int i, r, expected_dc, sum = 0;
- (void) above;
- (void) bd;
+ (void)above;
+ (void)bd;
- for (i = 0; i < bs; i++)
- sum += left[i];
+ for (i = 0; i < bs; i++) sum += left[i];
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
@@ -764,11 +745,10 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int i, r, expected_dc, sum = 0;
- (void) left;
- (void) bd;
+ (void)left;
+ (void)bd;
- for (i = 0; i < bs; i++)
- sum += above[i];
+ for (i = 0; i < bs; i++) sum += above[i];
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
@@ -777,12 +757,12 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
}
}
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
const uint16_t *left, int bd) {
int i, r, expected_dc, sum = 0;
const int count = 2 * bs;
- (void) bd;
+ (void)bd;
for (i = 0; i < bs; i++) {
sum += above[i];
@@ -801,22 +781,22 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
// This serves as a wrapper function, so that all the prediction functions
// can be unified and accessed as a pointer array. Note that the boundary
// above and left are not necessarily used all the time.
-#define intra_pred_sized(type, size) \
- void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
- ptrdiff_t stride, \
- const uint8_t *above, \
- const uint8_t *left) { \
- type##_predictor(dst, stride, size, above, left); \
+#define intra_pred_sized(type, size) \
+ void vpx_##type##_predictor_##size##x##size##_c( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ type##_predictor(dst, stride, size, above, left); \
}
#if CONFIG_VP9_HIGHBITDEPTH
-#define intra_pred_highbd_sized(type, size) \
- void vpx_highbd_##type##_predictor_##size##x##size##_c( \
- uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
- const uint16_t *left, int bd) { \
+#define intra_pred_highbd_sized(type, size) \
+ void vpx_highbd_##type##_predictor_##size##x##size##_c( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
highbd_##type##_predictor(dst, stride, size, above, left, bd); \
}
+/* clang-format off */
#define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
@@ -867,4 +847,5 @@ intra_pred_allsizes(dc_128)
intra_pred_allsizes(dc_left)
intra_pred_allsizes(dc_top)
intra_pred_allsizes(dc)
+/* clang-format on */
#undef intra_pred_allsizes
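
The only non-whitespace change in intrapred.c is the pair of clang-format off/on markers fencing intra_pred_allsizes, presumably to keep the formatter from reflowing the table of macro invocations that stamps out one sized wrapper per predictor. A compressed sketch of the same pattern with hypothetical names (my_*_predictor_*, PRED_SIZED, PRED_ALLSIZES are not libvpx symbols):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* One generic predictor taking the block size as a parameter... */
static void v_pred(uint8_t *dst, ptrdiff_t stride, int bs,
                   const uint8_t *above) {
  int r;
  for (r = 0; r < bs; r++) {
    memcpy(dst, above, bs);
    dst += stride;
  }
}

/* ...and a macro that stamps out the fixed-size entry points, exactly
 * as intra_pred_sized() does above. */
#define PRED_SIZED(type, size)                                \
  void my_##type##_predictor_##size##x##size(                 \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *above) { \
    type##_pred(dst, stride, size, above);                    \
  }

/* clang-format off */
#define PRED_ALLSIZES(type) \
  PRED_SIZED(type, 4)       \
  PRED_SIZED(type, 8)       \
  PRED_SIZED(type, 16)      \
  PRED_SIZED(type, 32)

PRED_ALLSIZES(v)
/* clang-format on */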
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index e18d31d7a..1526663d8 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -15,8 +15,8 @@
#include "vpx_dsp/inv_txfm.h"
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
int i;
tran_low_t output[16];
tran_high_t a1, b1, c1, d1, e1;
@@ -127,8 +127,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
// Columns
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j * 4 + i];
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
idct4_c(temp_in, temp_out);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -223,8 +222,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
// Then transform columns
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
idct8_c(temp_in, temp_out);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -240,8 +238,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i)
- dest[i] = clip_pixel_add(dest[i], a1);
+ for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
dest += stride;
}
}
@@ -296,20 +293,20 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4]
- = output[5] = output[6] = output[7] = 0;
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = 0;
return;
}
// stage 1
- s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
- s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
- s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
- s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
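
The realigned butterflies above all follow one fixed-point pattern: to my understanding the cospi_N_64 constants (defined in txfm_common.h, outside this diff) are cos(N*pi/64) scaled by 2^14, and dct_const_round_shift() rounds that scale factor back out. A self-contained sketch of one such rotation under that assumption; the _sk names are illustrative, and 14 is assumed to match libvpx's DCT_CONST_BITS.

#include <stdint.h>

#define DCT_CONST_BITS_SK 14
static int64_t round_shift_sk(int64_t x) {
  return (x + (1 << (DCT_CONST_BITS_SK - 1))) >> DCT_CONST_BITS_SK;
}

/* Rotate (a, b) using a Q14 cosine/sine pair, as in e.g.
 *   t2 =  x2 * cospi_24_64 + x3 * cospi_8_64;
 *   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
 * above, where cospi_8_64 plays the role of sin for the same angle. */
static void butterfly_rotate_sk(int32_t a, int32_t b, int32_t cos_q14,
                                int32_t sin_q14, int32_t *out0,
                                int32_t *out1) {
  *out0 = (int32_t)round_shift_sk((int64_t)a * cos_q14 + (int64_t)b * sin_q14);
  *out1 = (int32_t)round_shift_sk((int64_t)b * cos_q14 - (int64_t)a * sin_q14);
}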
@@ -376,8 +373,7 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
// Then transform columns
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
idct8_c(temp_in, temp_out);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -391,22 +387,22 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp1, temp2;
// stage 1
- step1[0] = input[0/2];
- step1[1] = input[16/2];
- step1[2] = input[8/2];
- step1[3] = input[24/2];
- step1[4] = input[4/2];
- step1[5] = input[20/2];
- step1[6] = input[12/2];
- step1[7] = input[28/2];
- step1[8] = input[2/2];
- step1[9] = input[18/2];
- step1[10] = input[10/2];
- step1[11] = input[26/2];
- step1[12] = input[6/2];
- step1[13] = input[22/2];
- step1[14] = input[14/2];
- step1[15] = input[30/2];
+ step1[0] = input[0 / 2];
+ step1[1] = input[16 / 2];
+ step1[2] = input[8 / 2];
+ step1[3] = input[24 / 2];
+ step1[4] = input[4 / 2];
+ step1[5] = input[20 / 2];
+ step1[6] = input[12 / 2];
+ step1[7] = input[28 / 2];
+ step1[8] = input[2 / 2];
+ step1[9] = input[18 / 2];
+ step1[10] = input[10 / 2];
+ step1[11] = input[26 / 2];
+ step1[12] = input[6 / 2];
+ step1[13] = input[22 / 2];
+ step1[14] = input[14 / 2];
+ step1[15] = input[30 / 2];
// stage 2
step2[0] = step1[0];
@@ -567,8 +563,7 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
// Then transform columns
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
idct16_c(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -598,21 +593,20 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t x14 = input[1];
tran_high_t x15 = input[14];
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
- | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4]
- = output[5] = output[6] = output[7] = output[8]
- = output[9] = output[10] = output[11] = output[12]
- = output[13] = output[14] = output[15] = 0;
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = output[8] = output[9] = output[10] =
+ output[11] = output[12] = output[13] = output[14] = output[15] = 0;
return;
}
// stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
@@ -621,9 +615,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
@@ -651,14 +645,14 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
s5 = x5;
s6 = x6;
s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
x0 = WRAPLOW(s0 + s4);
x1 = WRAPLOW(s1 + s5);
@@ -682,18 +676,18 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
s8 = x8;
s9 = x9;
s10 = x10;
s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = WRAPLOW(s0 + s2);
x1 = WRAPLOW(s1 + s3);
@@ -713,13 +707,13 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
// stage 4
- s2 = (- cospi_16_64) * (x2 + x3);
+ s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (- x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (- x10 + x11);
- s14 = (- cospi_16_64) * (x14 + x15);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
x2 = WRAPLOW(dct_const_round_shift(s2));
@@ -766,8 +760,7 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
// Then transform columns
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
idct16_c(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -783,8 +776,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i)
- dest[i] = clip_pixel_add(dest[i], a1);
+ for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
dest += stride;
}
}
@@ -1166,8 +1158,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
// Rows
for (i = 0; i < 32; ++i) {
int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j)
- zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
for (j = 0; j < 8; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
for (j = 0; j < 4; ++j)
@@ -1185,8 +1176,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
// Columns
for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -1197,7 +1187,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
- tran_low_t out[32 * 32] = {0};
+ tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
int i, j;
tran_low_t temp_in[32], temp_out[32];
@@ -1212,8 +1202,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
// Columns
for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -1224,7 +1213,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
- tran_low_t out[32 * 32] = {0};
+ tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
int i, j;
tran_low_t temp_in[32], temp_out[32];
@@ -1239,8 +1228,7 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
// Columns
for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -1258,8 +1246,7 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i)
- dest[i] = clip_pixel_add(dest[i], a1);
+ for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
dest += stride;
}
}
@@ -1309,14 +1296,14 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
- HIGHBD_WRAPLOW(a1, bd), bd);
- dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
- HIGHBD_WRAPLOW(b1, bd), bd);
- dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
- HIGHBD_WRAPLOW(c1, bd), bd);
- dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
- HIGHBD_WRAPLOW(d1, bd), bd);
+ dest[stride * 0] =
+ highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
+ dest[stride * 1] =
+ highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
+ dest[stride * 2] =
+ highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
+ dest[stride * 3] =
+ highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
ip++;
dest++;
@@ -1331,7 +1318,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
const tran_low_t *ip = in;
tran_low_t *op = tmp;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- (void) bd;
+ (void)bd;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
@@ -1343,14 +1330,14 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1;
a1 = ip[0] - e1;
- dest[dest_stride * 0] = highbd_clip_pixel_add(
- dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] = highbd_clip_pixel_add(
- dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] = highbd_clip_pixel_add(
- dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] = highbd_clip_pixel_add(
- dest[dest_stride * 3], e1, bd);
+ dest[dest_stride * 0] =
+ highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] =
+ highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] =
+ highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] =
+ highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
ip++;
dest++;
}
@@ -1359,7 +1346,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4];
tran_high_t temp1, temp2;
- (void) bd;
+ (void)bd;
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -1394,8 +1381,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
// Columns
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j * 4 + i];
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
vpx_highbd_idct4_c(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1408,8 +1394,8 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int dest_stride, int bd) {
int i;
tran_high_t a1;
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
@@ -1486,8 +1472,7 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
// Then transform columns.
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1500,14 +1485,13 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i)
- dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest += stride;
}
}
@@ -1519,7 +1503,7 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
- (void) bd;
+ (void)bd;
if (!(x0 | x1 | x2 | x3)) {
memset(output, 0, 4 * sizeof(*output));
@@ -1561,7 +1545,7 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t x5 = input[4];
tran_low_t x6 = input[1];
tran_low_t x7 = input[6];
- (void) bd;
+ (void)bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
memset(output, 0, 8 * sizeof(*output));
@@ -1569,14 +1553,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
}
// stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
@@ -1592,10 +1576,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
@@ -1644,8 +1628,7 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
}
// Then transform columns.
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1657,25 +1640,25 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
- (void) bd;
+ (void)bd;
// stage 1
- step1[0] = input[0/2];
- step1[1] = input[16/2];
- step1[2] = input[8/2];
- step1[3] = input[24/2];
- step1[4] = input[4/2];
- step1[5] = input[20/2];
- step1[6] = input[12/2];
- step1[7] = input[28/2];
- step1[8] = input[2/2];
- step1[9] = input[18/2];
- step1[10] = input[10/2];
- step1[11] = input[26/2];
- step1[12] = input[6/2];
- step1[13] = input[22/2];
- step1[14] = input[14/2];
- step1[15] = input[30/2];
+ step1[0] = input[0 / 2];
+ step1[1] = input[16 / 2];
+ step1[2] = input[8 / 2];
+ step1[3] = input[24 / 2];
+ step1[4] = input[4 / 2];
+ step1[5] = input[20 / 2];
+ step1[6] = input[12 / 2];
+ step1[7] = input[28 / 2];
+ step1[8] = input[2 / 2];
+ step1[9] = input[18 / 2];
+ step1[10] = input[10 / 2];
+ step1[11] = input[26 / 2];
+ step1[12] = input[6 / 2];
+ step1[13] = input[22 / 2];
+ step1[14] = input[14 / 2];
+ step1[15] = input[30 / 2];
// stage 2
step2[0] = step1[0];
@@ -1837,8 +1820,7 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
// Then transform columns.
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1867,20 +1849,20 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t x13 = input[12];
tran_low_t x14 = input[1];
tran_low_t x15 = input[14];
- (void) bd;
+ (void)bd;
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
- | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
memset(output, 0, 16 * sizeof(*output));
return;
}
// stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
@@ -1889,9 +1871,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
@@ -1901,8 +1883,8 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+ x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+ x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
@@ -1981,13 +1963,13 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
// stage 4
- s2 = (- cospi_16_64) * (x2 + x3);
+ s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (-x6 + x7);
s10 = cospi_16_64 * (x10 + x11);
s11 = cospi_16_64 * (-x10 + x11);
- s14 = (- cospi_16_64) * (x14 + x15);
+ s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
@@ -2035,8 +2017,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
// Then transform columns.
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -2049,24 +2030,23 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i)
- dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest += stride;
}
}
-static void highbd_idct32_c(const tran_low_t *input,
- tran_low_t *output, int bd) {
+static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2;
- (void) bd;
+ (void)bd;
// stage 1
step1[0] = input[0];
@@ -2442,8 +2422,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows
for (i = 0; i < 32; ++i) {
tran_low_t zero_coeff[16];
- for (j = 0; j < 16; ++j)
- zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
for (j = 0; j < 8; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
for (j = 0; j < 4; ++j)
@@ -2461,8 +2440,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
// Columns
for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -2473,7 +2451,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
- tran_low_t out[32 * 32] = {0};
+ tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
int i, j;
tran_low_t temp_in[32], temp_out[32];
@@ -2488,8 +2466,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
}
// Columns
for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -2504,14 +2481,13 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i)
- dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest += stride;
}
}
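
For reference, the `*_1_add` hunks above only reflow whitespace; the DC-only arithmetic is unchanged. A minimal plain-C sketch of that path follows (not code from this change): the value 11585 for cospi_16_64, DCT_CONST_BITS = 14, the local clip helper and the function name are assumptions, and the WRAPLOW truncation step is omitted for brevity.

#include <stdint.h>

#define DCT_CONST_BITS 14
#define CPI16 11585 /* assumed value of cospi_16_64 */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Assumed stand-in for clip_pixel_add(): add and clamp to [0, 255]. */
static uint8_t clip_pixel_add_sketch(uint8_t val, int trans) {
  const int sum = val + trans;
  return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
}

/* Add the single DC term of an n x n inverse DCT to the destination.
 * shift is 6 for the 16x16/32x32 variants shown above and 5 for 8x8. */
static void idct_dc_only_add_sketch(int32_t dc, uint8_t *dest, int stride,
                                    int n, int shift) {
  int i, j;
  int32_t out = ROUND_POWER_OF_TWO(dc * CPI16, DCT_CONST_BITS);
  int32_t a1;
  out = ROUND_POWER_OF_TWO(out * CPI16, DCT_CONST_BITS);
  a1 = ROUND_POWER_OF_TWO(out, shift);
  for (j = 0; j < n; ++j) {
    for (i = 0; i < n; ++i) dest[i] = clip_pixel_add_sketch(dest[i], a1);
    dest += stride;
  }
}
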
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 9cfe1be3a..e530730d5 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -41,8 +41,7 @@ static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
}
#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_high_t highbd_check_range(tran_high_t input,
- int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid highbitdepth VP9 streams, intermediate stage coefficients will
// stay within the ranges:
@@ -53,9 +52,9 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input,
const int32_t int_min = -int_max - 1;
assert(int_min <= input);
assert(input <= int_max);
- (void) int_min;
+ (void)int_min;
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- (void) bd;
+ (void)bd;
return input;
}
@@ -86,15 +85,14 @@ static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
#if CONFIG_VP9_HIGHBITDEPTH
#define HIGHBD_WRAPLOW(x, bd) \
- ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+ ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
#endif // CONFIG_VP9_HIGHBITDEPTH
-#else // CONFIG_EMULATE_HARDWARE
+#else // CONFIG_EMULATE_HARDWARE
#define WRAPLOW(x) ((int32_t)check_range(x))
#if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) \
- ((int32_t)highbd_check_range((x), bd))
+#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EMULATE_HARDWARE
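
When CONFIG_EMULATE_HARDWARE is set, the WRAPLOW/HIGHBD_WRAPLOW macros touched above perform a sign-extending truncation (to 16 bits, or to bd + 8 bits). A small standalone sketch of that effect, with check_range dropped and the helper names and sample values invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the macro's shift-based truncation to a signed 16-bit lane. */
static int32_t wraplow_sketch(int32_t x) { return (x << 16) >> 16; }

/* High-bit-depth variant: keep the low (bd + 8) bits, sign-extended. */
static int32_t highbd_wraplow_sketch(int32_t x, int bd) {
  return (x << (24 - bd)) >> (24 - bd);
}

int main(void) {
  /* 40000 does not fit in int16_t, so it wraps to 40000 - 65536 = -25536. */
  printf("%d\n", wraplow_sketch(40000));
  /* At bd = 10 the lane is 18 bits wide, so 40000 passes through unchanged. */
  printf("%d\n", highbd_wraplow_sketch(40000, 10));
  return 0;
}
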
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 645a1ab95..40f02b46d 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -22,23 +22,18 @@ static INLINE int8_t signed_char_clamp(int t) {
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE int16_t signed_char_clamp_high(int t, int bd) {
switch (bd) {
- case 10:
- return (int16_t)clamp(t, -128*4, 128*4-1);
- case 12:
- return (int16_t)clamp(t, -128*16, 128*16-1);
+ case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+ case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
case 8:
- default:
- return (int16_t)clamp(t, -128, 128-1);
+ default: return (int16_t)clamp(t, -128, 128 - 1);
}
}
#endif
// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
- uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1,
- uint8_t q2, uint8_t q3) {
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3) {
int8_t mask = 0;
mask |= (abs(p3 - p2) > limit) * -1;
mask |= (abs(p2 - p1) > limit) * -1;
@@ -46,14 +41,12 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
mask |= (abs(q1 - q0) > limit) * -1;
mask |= (abs(q2 - q1) > limit) * -1;
mask |= (abs(q3 - q2) > limit) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
return ~mask;
}
-static INLINE int8_t flat_mask4(uint8_t thresh,
- uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1,
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+ uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) {
int8_t mask = 0;
mask |= (abs(p1 - p0) > thresh) * -1;
@@ -65,12 +58,10 @@ static INLINE int8_t flat_mask4(uint8_t thresh,
return ~mask;
}
-static INLINE int8_t flat_mask5(uint8_t thresh,
- uint8_t p4, uint8_t p3,
- uint8_t p2, uint8_t p1,
- uint8_t p0, uint8_t q0,
- uint8_t q1, uint8_t q2,
- uint8_t q3, uint8_t q4) {
+static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3,
+ uint8_t q4) {
int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
mask |= (abs(p4 - p0) > thresh) * -1;
mask |= (abs(q4 - q0) > thresh) * -1;
@@ -81,8 +72,8 @@ static INLINE int8_t flat_mask5(uint8_t thresh,
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
int8_t hev = 0;
- hev |= (abs(p1 - p0) > thresh) * -1;
- hev |= (abs(q1 - q0) > thresh) * -1;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
return hev;
}
@@ -90,10 +81,10 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
int8_t filter1, filter2;
- const int8_t ps1 = (int8_t) *op1 ^ 0x80;
- const int8_t ps0 = (int8_t) *op0 ^ 0x80;
- const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
- const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
+ const int8_t ps1 = (int8_t)*op1 ^ 0x80;
+ const int8_t ps0 = (int8_t)*op0 ^ 0x80;
+ const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
+ const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
// add outer taps if we have high edge variance
@@ -127,9 +118,9 @@ void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
@@ -151,9 +142,9 @@ void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
s += pitch;
}
@@ -168,9 +159,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
}
static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1,
+ uint8_t *op3, uint8_t *op2, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) {
if (flat && mask) {
const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -184,7 +174,7 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
*oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
*oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
} else {
- filter4(mask, thresh, op1, op0, oq0, oq1);
+ filter4(mask, thresh, op1, op0, oq0, oq1);
}
}
@@ -198,11 +188,11 @@ void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p);
+ filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p);
++s;
}
}
@@ -222,11 +212,11 @@ void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3);
+ filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ s + 3);
s += pitch;
}
}
@@ -239,52 +229,55 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter16(int8_t mask, uint8_t thresh,
- uint8_t flat, uint8_t flat2,
- uint8_t *op7, uint8_t *op6,
- uint8_t *op5, uint8_t *op4,
- uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1,
- uint8_t *oq2, uint8_t *oq3,
- uint8_t *oq4, uint8_t *oq5,
+static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t flat2, uint8_t *op7, uint8_t *op6,
+ uint8_t *op5, uint8_t *op4, uint8_t *op3,
+ uint8_t *op2, uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
+ uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
uint8_t *oq6, uint8_t *oq7) {
if (flat2 && flat && mask) {
- const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
- p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
+ p2 = *op2, p1 = *op1, p0 = *op0;
- const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
- q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+ q5 = *oq5, q6 = *oq6, q7 = *oq7;
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
- *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0, 4);
- *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1, 4);
- *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2, 4);
- *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3, 4);
- *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4, 4);
+ *op6 = ROUND_POWER_OF_TWO(
+ p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(
+ p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
+ 4);
*op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5, 4);
- *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
- *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
- *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
- *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
- *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
- *oq6 = ROUND_POWER_OF_TWO(p0 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ q0 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
+ q1 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
+ q2 + q3 + q4 + q5 + q6 + q7,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
+ q3 + q4 + q5 + q6 + q7 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(
+ p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
} else {
filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
}
@@ -300,18 +293,17 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
for (i = 0; i < 8 * count; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flat_mask5(1,
- s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
- q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
-
- filter16(mask, *thresh, flat, flat2,
- s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p,
- s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
+ const int8_t flat2 =
+ flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
+ s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
+
+ filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
+ s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
+ s + 7 * p);
++s;
}
}
@@ -326,25 +318,23 @@ void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
}
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
int count) {
int i;
for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
- q0, s[4], s[5], s[6], s[7]);
+ const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4],
+ s[5], s[6], s[7]);
- filter16(mask, *thresh, flat, flat2,
- s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+ filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
+ s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
+ s + 7);
s += p;
}
}
@@ -362,9 +352,8 @@ void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
#if CONFIG_VP9_HIGHBITDEPTH
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
- uint16_t p3, uint16_t p2,
- uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1,
+ uint16_t p3, uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0, uint16_t q1,
uint16_t q2, uint16_t q3, int bd) {
int8_t mask = 0;
int16_t limit16 = (uint16_t)limit << (bd - 8);
@@ -375,15 +364,14 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
mask |= (abs(q1 - q0) > limit16) * -1;
mask |= (abs(q2 - q1) > limit16) * -1;
mask |= (abs(q3 - q2) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
return ~mask;
}
-static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
- uint16_t p3, uint16_t p2,
- uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1,
- uint16_t q2, uint16_t q3, int bd) {
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2, uint16_t q3,
+ int bd) {
int8_t mask = 0;
int16_t thresh16 = (uint16_t)thresh << (bd - 8);
mask |= (abs(p1 - p0) > thresh16) * -1;
@@ -395,11 +383,9 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
return ~mask;
}
-static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
- uint16_t p4, uint16_t p3,
- uint16_t p2, uint16_t p1,
- uint16_t p0, uint16_t q0,
- uint16_t q1, uint16_t q2,
+static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
+ uint16_t p2, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, uint16_t q2,
uint16_t q3, uint16_t q4, int bd) {
int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
int16_t thresh16 = (uint16_t)thresh << (bd - 8);
@@ -470,21 +456,17 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
const uint16_t q1 = s[1 * p];
const uint16_t q2 = s[2 * p];
const uint16_t q3 = s[3 * p];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
++s;
}
}
-void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
+void vpx_highbd_lpf_horizontal_4_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
@@ -498,31 +480,26 @@ void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
s += pitch;
}
}
-void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
+void vpx_highbd_lpf_vertical_4_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
- thresh1, bd);
+ vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
}
static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
- uint16_t *op3, uint16_t *op2,
- uint16_t *op1, uint16_t *op0,
- uint16_t *oq0, uint16_t *oq1,
+ uint16_t *op3, uint16_t *op2, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
uint16_t *oq2, uint16_t *oq3, int bd) {
if (flat && mask) {
const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -536,7 +513,7 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
*oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
*oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
} else {
- highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
}
}
@@ -551,25 +528,20 @@ void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- highbd_filter8(mask, *thresh, flat,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
+ s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
++s;
}
}
-void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
+void vpx_highbd_lpf_horizontal_8_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
@@ -582,40 +554,31 @@ void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
for (i = 0; i < 8; ++i) {
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- highbd_filter8(mask, *thresh, flat,
- s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3,
- bd);
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
+ s + 2, s + 3, bd);
s += pitch;
}
}
-void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
+void vpx_highbd_lpf_vertical_8_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
- thresh1, bd);
-}
-
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
- uint8_t flat, uint8_t flat2,
- uint16_t *op7, uint16_t *op6,
- uint16_t *op5, uint16_t *op4,
- uint16_t *op3, uint16_t *op2,
- uint16_t *op1, uint16_t *op0,
- uint16_t *oq0, uint16_t *oq1,
- uint16_t *oq2, uint16_t *oq3,
- uint16_t *oq4, uint16_t *oq5,
+ vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t flat2, uint16_t *op7, uint16_t *op6,
+ uint16_t *op5, uint16_t *op4, uint16_t *op3,
+ uint16_t *op2, uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+ uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
uint16_t *oq6, uint16_t *oq7, int bd) {
if (flat2 && flat && mask) {
const uint16_t p7 = *op7;
@@ -636,34 +599,40 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
const uint16_t q7 = *oq7;
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
- *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0, 4);
- *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1, 4);
- *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2, 4);
- *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3, 4);
- *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4, 4);
+ *op6 = ROUND_POWER_OF_TWO(
+ p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(
+ p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
+ 4);
*op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5, 4);
- *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
- *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
- *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
- *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
- *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
- *oq6 = ROUND_POWER_OF_TWO(p0 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ q0 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
+ q1 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
+ q2 + q3 + q4 + q5 + q6 + q7,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
+ q3 + q4 + q5 + q6 + q7 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(
+ p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
} else {
highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
bd);
@@ -673,8 +642,8 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int count, int bd) {
+ const uint8_t *thresh, int count,
+ int bd) {
int i;
// loop filter designed to work using chars so that we can make maximum use
@@ -688,20 +657,18 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
const uint16_t q1 = s[1 * p];
const uint16_t q2 = s[2 * p];
const uint16_t q3 = s[3 * p];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- const int8_t flat2 = highbd_flat_mask5(
- 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
- q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
- highbd_filter16(mask, *thresh, flat, flat2,
- s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p,
- s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
- bd);
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 =
+ highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
+ s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
+ s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
+ s + 6 * p, s + 7 * p, bd);
++s;
}
}
@@ -723,8 +690,8 @@ void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int count, int bd) {
+ const uint8_t *thresh, int count,
+ int bd) {
int i;
for (i = 0; i < count; ++i) {
@@ -736,17 +703,16 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
const uint16_t q1 = s[1];
const uint16_t q2 = s[2];
const uint16_t q3 = s[3];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
q0, s[4], s[5], s[6], s[7], bd);
- highbd_filter16(mask, *thresh, flat, flat2,
- s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
- bd);
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
+ s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
+ s + 5, s + 6, s + 7, bd);
s += p;
}
}
@@ -760,8 +726,7 @@ void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int bd) {
+ const uint8_t *thresh, int bd) {
highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
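
A standalone sketch of the mask idiom that recurs throughout the loop-filter hunks above: each `(abs(a - b) > limit) * -1` term evaluates to 0 or -1 (all ones), so OR-ing the terms and negating the result yields ~0 ("apply the filter") only when every neighbouring difference is within its limit. The helper name and the sample pixel values below are invented for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int8_t pair_mask_sketch(uint8_t limit, uint8_t a, uint8_t b,
                               uint8_t c) {
  int8_t mask = 0;
  mask |= (abs(a - b) > limit) * -1; /* 0 if within limit, -1 otherwise */
  mask |= (abs(b - c) > limit) * -1;
  return (int8_t)~mask; /* -1 (all ones) only if every test passed */
}

int main(void) {
  printf("%d\n", pair_mask_sketch(4, 100, 102, 103)); /* -1: smooth, filter  */
  printf("%d\n", pair_mask_sketch(4, 100, 120, 103)); /*  0: real edge, skip */
  return 0;
}
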
diff --git a/vpx_dsp/mips/add_noise_msa.c b/vpx_dsp/mips/add_noise_msa.c
index c66c93c58..e372b9d8c 100644
--- a/vpx_dsp/mips/add_noise_msa.c
+++ b/vpx_dsp/mips/add_noise_msa.c
@@ -12,8 +12,8 @@
#include "./macros_msa.h"
void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise,
- int blackclamp, int whiteclamp,
- int width, int height, int32_t pitch) {
+ int blackclamp, int whiteclamp, int width,
+ int height, int32_t pitch) {
uint32_t i, j;
for (i = 0; i < height / 2; ++i) {
diff --git a/vpx_dsp/mips/common_dspr2.h b/vpx_dsp/mips/common_dspr2.h
index 7a10bf1c4..0a42f5cec 100644
--- a/vpx_dsp/mips/common_dspr2.h
+++ b/vpx_dsp/mips/common_dspr2.h
@@ -24,37 +24,21 @@ extern "C" {
extern uint8_t *vpx_ff_cropTbl; // From "vpx_dsp/mips/intrapred4_dspr2.c"
static INLINE void prefetch_load(const unsigned char *src) {
- __asm__ __volatile__ (
- "pref 0, 0(%[src]) \n\t"
- :
- : [src] "r" (src)
- );
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
}
/* prefetch data for store */
static INLINE void prefetch_store(unsigned char *dst) {
- __asm__ __volatile__ (
- "pref 1, 0(%[dst]) \n\t"
- :
- : [dst] "r" (dst)
- );
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
}
static INLINE void prefetch_load_streamed(const unsigned char *src) {
- __asm__ __volatile__ (
- "pref 4, 0(%[src]) \n\t"
- :
- : [src] "r" (src)
- );
+ __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
}
/* prefetch data for store */
static INLINE void prefetch_store_streamed(unsigned char *dst) {
- __asm__ __volatile__ (
- "pref 5, 0(%[dst]) \n\t"
- :
- : [dst] "r" (dst)
- );
+ __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
}
#endif // #if HAVE_DSPR2
#ifdef __cplusplus
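
The prefetch helpers reformatted above are called at the top of the dspr2 row loops in the convolve files that follow. A hedged sketch of that usage pattern, substituting __builtin_prefetch for the MIPS `pref` instruction; the substitution, the helper names and the copy loop are assumptions, not part of this change.

#include <string.h>

/* Assumed portable stand-in for prefetch_load(): hint a streaming read. */
static void prefetch_load_sketch(const unsigned char *src) {
  __builtin_prefetch(src, 0 /* read */, 0 /* streaming */);
}

/* Assumed portable stand-in for prefetch_store(): hint a streaming write. */
static void prefetch_store_sketch(unsigned char *dst) {
  __builtin_prefetch(dst, 1 /* write */, 0 /* streaming */);
}

/* Copy h rows of w bytes, hinting the next row before each memcpy, the way
 * the dspr2 kernels call prefetch_load()/prefetch_store() per row. */
static void copy_plane_sketch(const unsigned char *src, int src_stride,
                              unsigned char *dst, int dst_stride, int w,
                              int h) {
  int y;
  for (y = 0; y < h; ++y) {
    prefetch_load_sketch(src + src_stride);
    prefetch_store_sketch(dst + dst_stride);
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}
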
diff --git a/vpx_dsp/mips/convolve2_avg_dspr2.c b/vpx_dsp/mips/convolve2_avg_dspr2.c
index 3c767672f..ae88eddfd 100644
--- a/vpx_dsp/mips/convolve2_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -18,25 +18,22 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t w,
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
int32_t h) {
- int32_t x, y;
+ int32_t x, y;
const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
const int16_t *filter = &filter_y[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -48,7 +45,7 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -105,16 +102,13 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride), [cm] "r" (cm),
- [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -124,23 +118,21 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
}
static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- const int16_t *filter_y,
- int32_t h) {
- int32_t x, y;
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
const int16_t *filter = &filter_y[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -153,7 +145,7 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -210,16 +202,13 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride), [cm] "r" (cm),
- [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -231,18 +220,16 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
uint32_t pos = 38;
assert(y_step_q4 == 16);
/* bit positon for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
prefetch_store(dst);
@@ -251,22 +238,17 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
case 8:
case 16:
case 32:
- convolve_bi_avg_vert_4_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, w, h);
+ convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ w, h);
break;
case 64:
prefetch_store(dst + 32);
- convolve_bi_avg_vert_64_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, h);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
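
The assembly above computes a 2-tap (bilinear) vertical filter with destination averaging; a rough plain-C equivalent is sketched below. FILTER_BITS = 7, the clip helper and the function name are assumptions modelled on the generic vpx_convolve8_avg_vert_c fallback that the reformatted code calls for unsupported widths.

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static uint8_t clip_pixel_sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void convolve2_avg_vert_sketch(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride,
                                      const int16_t *filter_y, int w, int h) {
  /* Only taps 3 and 4 of the 8-tap array are nonzero for a bilinear filter;
   * filter45 in the assembly packs exactly this pair into one register so a
   * single dpa.w.ph performs both multiplies. */
  const int f3 = filter_y[3], f4 = filter_y[4];
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int sum = src[x] * f3 + src[x + src_stride] * f4;
      const uint8_t val =
          clip_pixel_sketch(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      dst[x] = (uint8_t)ROUND_POWER_OF_TWO(dst[x] + val, 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
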
diff --git a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
index 932a73d39..e944207b6 100644
--- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -19,20 +19,18 @@
#if HAVE_DSPR2
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
- int32_t Temp1, Temp2, Temp3, Temp4;
+ int32_t Temp1, Temp2, Temp3, Temp4;
uint32_t vector4a = 64;
uint32_t tp1, tp2;
uint32_t p1, p2, p3;
uint32_t tn1, tn2;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -42,7 +40,7 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -61,51 +59,49 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
"dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
"extp %[Temp3], $ac2, 31 \n\t"
- "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
/* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
"mtlo %[vector4a], $ac3 \n\t"
"mthi $zero, $ac3 \n\t"
- "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
"preceu.ph.qbr %[p1], %[tp2] \n\t"
"preceu.ph.qbl %[p3], %[tp2] \n\t"
"dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
"extp %[Temp2], $ac3, 31 \n\t"
- "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
/* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
"mtlo %[vector4a], $ac2 \n\t"
"mthi $zero, $ac2 \n\t"
- "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
- "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
"dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
"extp %[Temp4], $ac2, 31 \n\t"
- "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
- "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
/* clamp */
- "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
- "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
- "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
- "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
- "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
- "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
- "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
- : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -114,11 +110,9 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
}
static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64;
@@ -127,7 +121,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
uint32_t p1, p2, p3, p4, n1;
uint32_t st0, st1;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -137,7 +131,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -246,15 +240,12 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
"sb %[tp4], 5(%[dst]) \n\t"
"sb %[tp1], 7(%[dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [st0] "=&r" (st0), [st1] "=&r" (st1),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [n1] "=&r" (n1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -263,12 +254,10 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
}
static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
- int32_t count) {
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
int32_t y, c;
const uint8_t *src;
uint8_t *dst;
@@ -279,7 +268,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -293,7 +282,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -493,14 +482,13 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [qload3] "=&r" (qload3), [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
src += 16;
dst += 16;
@@ -513,11 +501,10 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
}
static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
int32_t y, c;
const uint8_t *src;
uint8_t *dst;
@@ -528,7 +515,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -544,7 +531,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -744,14 +731,13 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [qload3] "=&r" (qload3), [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
src += 16;
dst += 16;
@@ -773,11 +759,9 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16);
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
/* prefetch data to cache memory */
prefetch_load(src);
@@ -786,39 +770,31 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
- convolve_bi_avg_horiz_4_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h);
+ convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
break;
case 8:
- convolve_bi_avg_horiz_8_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h);
+ convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
break;
case 16:
- convolve_bi_avg_horiz_16_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h, 1);
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 1);
break;
case 32:
- convolve_bi_avg_horiz_16_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h, 2);
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 2);
break;
case 64:
prefetch_load(src + 64);
prefetch_store(dst + 32);
- convolve_bi_avg_horiz_64_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h);
+ convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
break;
default:
- vpx_convolve8_avg_horiz_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/convolve2_dspr2.c b/vpx_dsp/mips/convolve2_dspr2.c
index d111029d4..e355ba3a0 100644
--- a/vpx_dsp/mips/convolve2_dspr2.c
+++ b/vpx_dsp/mips/convolve2_dspr2.c
@@ -18,21 +18,18 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t y;
- uint8_t *cm = vpx_ff_cropTbl;
- uint8_t *dst_ptr;
- int32_t Temp1, Temp2;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2;
+static void convolve_bi_horiz_4_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -42,7 +39,7 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -94,13 +91,10 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
"sb %[p2], 0(%[dst_ptr]) \n\t"
"addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [dst_ptr] "+r" (dst_ptr)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
- [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_stride] "r"(dst_stride));
/* Next row... */
src += src_stride;
@@ -108,12 +102,9 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
}
}
-static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_bi_horiz_8_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
uint8_t *dst_ptr;
@@ -124,7 +115,7 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
uint8_t *odd_dst;
uint32_t dst_pitch_2 = (dst_stride << 1);
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -136,7 +127,7 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
dst_ptr = dst;
odd_dst = (dst_ptr + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -180,7 +171,8 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
"dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
"extp %[Temp3], $ac2, 31 \n\t"
- "lbux %[Temp1], %[p3](%[cm]) \n\t"
+ "lbux %[Temp1], %[p3](%[cm]) "
+ "\n\t"
/* odd 1. pixel */
"mtlo %[vector4a], $ac1 \n\t"
@@ -231,13 +223,12 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
"sb %[p1], 0(%[odd_dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
+ [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
/* Next row... */
src += src_stride;
@@ -245,26 +236,22 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
}
}
-static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
- int32_t count) {
- int32_t c, y;
+static void convolve_bi_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+ int32_t c, y;
const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -279,193 +266,329 @@ static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
odd_dst = (dst + dst_stride);
for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
/* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload1], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
/* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p5], %[qload1] \n\t"
- "ulw %[qload2], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
/* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
/* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
/* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
/* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
/* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
/* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
/* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
/* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
/* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
/* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
/* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
/* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
/* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
/* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
/* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
src += 16;
dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
@@ -478,25 +601,22 @@ static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
}
}
-static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t c, y;
+static void convolve_bi_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
+ int32_t c, y;
const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -512,193 +632,329 @@ static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
odd_dst = (dst + dst_stride);
for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
/* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload1], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
/* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p5], %[qload1] \n\t"
- "ulw %[qload2], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
/* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
/* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
/* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
/* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
/* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
/* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
/* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
/* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
/* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
/* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
/* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
/* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
/* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
/* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
/* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
src += 16;
dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
@@ -731,18 +987,15 @@ void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
}
}
-void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter,
- int w, int h) {
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h) {
uint32_t pos = 38;
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
/* prefetch data to cache memory */
prefetch_load(src);
@@ -750,32 +1003,26 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
- convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
- dst, dst_stride,
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
filter, h);
break;
case 8:
- convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
- dst, dst_stride,
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
filter, h);
break;
case 16:
case 32:
- convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
- dst, dst_stride,
- filter, h,
- (w/16));
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h, (w / 16));
break;
case 64:
prefetch_load(src + 32);
- convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
- dst, dst_stride,
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
filter, h);
break;
default:
- convolve_bi_horiz_transposed(src, src_stride,
- dst, dst_stride,
- filter, w, h);
+ convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
+ h);
break;
}
}
diff --git a/vpx_dsp/mips/convolve2_horiz_dspr2.c b/vpx_dsp/mips/convolve2_horiz_dspr2.c
index 9fe1a3454..5cc06b5f2 100644
--- a/vpx_dsp/mips/convolve2_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
int32_t Temp1, Temp2, Temp3, Temp4;
@@ -31,7 +28,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
uint32_t tp1, tp2;
uint32_t p1, p2;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -41,7 +38,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -86,13 +83,11 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
"sb %[tp2], 2(%[dst]) \n\t"
"sb %[p2], 3(%[dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
- : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -100,12 +95,9 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
}
}
-static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64;
@@ -114,7 +106,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
uint32_t p1, p2, p3, p4;
uint32_t st0, st1;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -124,7 +116,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -210,13 +202,12 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
"sb %[p2], 5(%[dst]) \n\t"
"sb %[p1], 7(%[dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
- [st0] "=&r" (st0), [st1] "=&r" (st1),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -225,11 +216,9 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
}
static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
+ const int16_t *filter_x0, int32_t h,
int32_t count) {
int32_t y, c;
const uint8_t *src;
@@ -241,7 +230,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -255,7 +244,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -413,14 +402,13 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
src += 16;
dst += 16;
@@ -433,11 +421,9 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
}
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+ const int16_t *filter_x0, int32_t h) {
int32_t y, c;
const uint8_t *src;
uint8_t *dst;
@@ -448,7 +434,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3];
- uint32_t filter45;;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -464,7 +450,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -622,14 +608,13 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
src += 16;
dst += 16;
@@ -644,8 +629,8 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
uint32_t pos = 38;
assert(x_step_q4 == 16);
@@ -653,11 +638,9 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
prefetch_load((const uint8_t *)filter_x);
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
/* prefetch data to cache memory */
prefetch_load(src);
@@ -666,39 +649,31 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
- convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h);
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
break;
case 8:
- convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h);
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
break;
case 16:
- convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h, 1);
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
break;
case 32:
- convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h, 2);
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
break;
case 64:
prefetch_load(src + 64);
prefetch_store(dst + 32);
- convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h);
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
break;
default:
- vpx_convolve8_horiz_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/convolve2_vert_dspr2.c b/vpx_dsp/mips/convolve2_vert_dspr2.c
index dde6ffd54..eb1975e44 100644
--- a/vpx_dsp/mips/convolve2_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -18,25 +18,22 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_bi_vert_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t w,
+static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
int32_t h) {
- int32_t x, y;
+ int32_t x, y;
const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
const int16_t *filter = &filter_y[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -48,7 +45,7 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -98,16 +95,12 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [scratch1] "=&r" (scratch1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride),
- [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -116,24 +109,21 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
}
}
-static void convolve_bi_vert_64_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t h) {
- int32_t x, y;
+static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
const int16_t *filter = &filter_y[3];
- uint32_t filter45;
+ uint32_t filter45;
filter45 = ((const int32_t *)filter)[0];
@@ -145,7 +135,7 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -195,16 +185,12 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [scratch1] "=&r" (scratch1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride),
- [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -216,42 +202,34 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
uint32_t pos = 38;
assert(y_step_q4 == 16);
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
prefetch_store(dst);
switch (w) {
- case 4 :
- case 8 :
- case 16 :
- case 32 :
- convolve_bi_vert_4_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, w, h);
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
break;
- case 64 :
+ case 64:
prefetch_store(dst + 32);
- convolve_bi_vert_64_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, h);
+ convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
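
Note for readers following the DSPr2 assembly above: the 2-tap path only ever uses taps 3 and 4 of the 8-tap filter, which is why filter45 packs ((const int32_t *)&filter_y[3])[0] for the dpa.w.ph dual-accumulate, with the 64 in vector4a acting as the rounding term before the 7-bit shift recovered by extp. A plain-C sketch of the per-pixel arithmetic follows; the helper names are illustrative only and clip_byte() stands in for the vpx_ff_cropTbl lookup.

    /* Minimal reference sketch of the bilinear (2-tap) vertical filter. */
    static unsigned char clip_byte(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void convolve_bi_vert_ref(const unsigned char *src, int src_stride,
                                     unsigned char *dst, int dst_stride,
                                     const short *filter_y, int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          /* Only taps 3 and 4 are non-zero on this path. */
          const int sum =
              src[x] * filter_y[3] + src[x + src_stride] * filter_y[4];
          dst[x] = clip_byte((sum + 64) >> 7); /* round, shift by FILTER_BITS */
        }
        src += src_stride;
        dst += dst_stride;
      }
    }
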
diff --git a/vpx_dsp/mips/convolve8_avg_dspr2.c b/vpx_dsp/mips/convolve8_avg_dspr2.c
index 43da9e54f..31812299c 100644
--- a/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -18,25 +18,22 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_avg_vert_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t w,
+static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
int32_t h) {
- int32_t x, y;
+ int32_t x, y;
const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
vector1b = ((const int32_t *)filter_y)[0];
vector2b = ((const int32_t *)filter_y)[1];
@@ -53,7 +50,7 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -160,18 +157,16 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
- [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -180,24 +175,21 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
}
}
-static void convolve_avg_vert_64_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t h) {
- int32_t x, y;
+static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vpx_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
vector1b = ((const int32_t *)filter_y)[0];
vector2b = ((const int32_t *)filter_y)[1];
@@ -215,7 +207,7 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -322,18 +314,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
- [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -345,26 +335,21 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000);
if (((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve2_avg_vert_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
} else {
uint32_t pos = 38;
    /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
prefetch_store(dst);
@@ -373,22 +358,17 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
case 8:
case 16:
case 32:
- convolve_avg_vert_4_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, w, h);
+ convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
break;
case 64:
prefetch_store(dst + 32);
- convolve_avg_vert_64_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, h);
+ convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
@@ -397,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
/* Fixed size intermediate buffer places limits on parameters. */
DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
@@ -408,27 +388,20 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16);
assert(y_step_q4 == 16);
- if (intermediate_height < h)
- intermediate_height = h;
+ if (intermediate_height < h) intermediate_height = h;
- vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
- temp, 64,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, intermediate_height);
+ vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, intermediate_height);
- vpx_convolve8_avg_vert(temp + 64 * 3, 64,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
}
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+ const int16_t *filter_y, int filter_y_stride, int w,
+ int h) {
int x, y;
uint32_t tp1, tp2, tn1;
uint32_t tp3, tp4, tn2;
@@ -441,21 +414,19 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
/* 1 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
- : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
- [tp2] "=&r" (tp2)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
@@ -463,26 +434,24 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
break;
case 8:
/* 2 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t"
"ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
@@ -490,34 +459,32 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
break;
case 16:
/* 4 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t"
"ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 8(%[src]) \n\t"
"ulw %[tp2], 8(%[dst]) \n\t"
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
"ulw %[tp3], 12(%[src]) \n\t"
"ulw %[tp4], 12(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 8(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
@@ -525,50 +492,48 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
break;
case 32:
/* 8 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t"
"ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 8(%[src]) \n\t"
"ulw %[tp2], 8(%[dst]) \n\t"
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
"ulw %[tp3], 12(%[src]) \n\t"
"ulw %[tp4], 12(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 16(%[src]) \n\t"
"ulw %[tp2], 16(%[dst]) \n\t"
- "sw %[tn1], 8(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
"ulw %[tp3], 20(%[src]) \n\t"
"ulw %[tp4], 20(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 24(%[src]) \n\t"
"ulw %[tp2], 24(%[dst]) \n\t"
- "sw %[tn1], 16(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
"ulw %[tp3], 28(%[src]) \n\t"
"ulw %[tp4], 28(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 24(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 28(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
@@ -579,84 +544,82 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
prefetch_store(dst + 32);
/* 16 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_load(src + src_stride + 64);
prefetch_store(dst + dst_stride);
prefetch_store(dst + dst_stride + 32);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t"
"ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 8(%[src]) \n\t"
"ulw %[tp2], 8(%[dst]) \n\t"
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
"ulw %[tp3], 12(%[src]) \n\t"
"ulw %[tp4], 12(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 16(%[src]) \n\t"
"ulw %[tp2], 16(%[dst]) \n\t"
- "sw %[tn1], 8(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
"ulw %[tp3], 20(%[src]) \n\t"
"ulw %[tp4], 20(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 24(%[src]) \n\t"
"ulw %[tp2], 24(%[dst]) \n\t"
- "sw %[tn1], 16(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
"ulw %[tp3], 28(%[src]) \n\t"
"ulw %[tp4], 28(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 32(%[src]) \n\t"
"ulw %[tp2], 32(%[dst]) \n\t"
- "sw %[tn1], 24(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 28(%[dst]) \n\t" /* store */
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
"ulw %[tp3], 36(%[src]) \n\t"
"ulw %[tp4], 36(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 40(%[src]) \n\t"
"ulw %[tp2], 40(%[dst]) \n\t"
- "sw %[tn1], 32(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 36(%[dst]) \n\t" /* store */
+ "sw %[tn1], 32(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 36(%[dst]) \n\t" /* store */
"ulw %[tp3], 44(%[src]) \n\t"
"ulw %[tp4], 44(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 48(%[src]) \n\t"
"ulw %[tp2], 48(%[dst]) \n\t"
- "sw %[tn1], 40(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 44(%[dst]) \n\t" /* store */
+ "sw %[tn1], 40(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 44(%[dst]) \n\t" /* store */
"ulw %[tp3], 52(%[src]) \n\t"
"ulw %[tp4], 52(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"ulw %[tp1], 56(%[src]) \n\t"
"ulw %[tp2], 56(%[dst]) \n\t"
- "sw %[tn1], 48(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 52(%[dst]) \n\t" /* store */
+ "sw %[tn1], 48(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 52(%[dst]) \n\t" /* store */
"ulw %[tp3], 60(%[src]) \n\t"
"ulw %[tp4], 60(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 56(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 60(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 56(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
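
The adduh_r.qb instruction used throughout vpx_convolve_avg_dspr2() above is a per-byte average with round-to-nearest, applied to four bytes per 32-bit word; the unrolled cases only differ in how many words they process per row. A plain-C sketch of the equivalent scalar loop, matching the generic C averaging path, is shown below (names are illustrative only):

    /* Minimal reference sketch of the rounded average into dst. */
    static void convolve_avg_ref(const unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride, int w,
                                 int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          /* adduh_r.qb: (a + b + 1) >> 1 per byte, here done one byte at a time. */
          dst[x] = (unsigned char)((dst[x] + src[x] + 1) >> 1);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }
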
diff --git a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
index db0c2a4da..9a9bab25a 100644
--- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -18,16 +18,13 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3, Temp4;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
uint32_t vector4a = 64;
uint32_t tp1, tp2;
uint32_t p1, p2, p3, p4;
@@ -45,7 +42,7 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -76,13 +73,13 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
"dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
"extp %[Temp3], $ac2, 31 \n\t"
- "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
/* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
"mtlo %[vector4a], $ac3 \n\t"
"mthi $zero, $ac3 \n\t"
- "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
"preceu.ph.qbr %[n1], %[tp2] \n\t"
"preceu.ph.qbl %[n2], %[tp2] \n\t"
"preceu.ph.qbr %[n3], %[tn2] \n\t"
@@ -93,46 +90,44 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
"dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
"extp %[Temp2], $ac3, 31 \n\t"
- "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
/* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
"mtlo %[vector4a], $ac2 \n\t"
"mthi $zero, $ac2 \n\t"
"preceu.ph.qbr %[n1], %[tn1] \n\t"
- "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
- "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
"dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
"dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
"dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
"dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
"extp %[Temp4], $ac2, 31 \n\t"
- "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
- "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
/* clamp */
- "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
- "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
- "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
-
- "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
- "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
-
- "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
- "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
-
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -140,12 +135,9 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
}
}
-static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64;
@@ -167,7 +159,7 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -309,17 +301,15 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
"sb %[tn3], 5(%[dst]) \n\t"
"sb %[tn1], 7(%[dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
- [st0] "=&r" (st0), [st1] "=&r" (st1),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [n1] "=&r" (n1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -328,11 +318,9 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
}
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
+ const int16_t *filter_x0, int32_t h,
int32_t count) {
int32_t y, c;
const uint8_t *src;
@@ -360,7 +348,7 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -618,16 +606,15 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [qload3] "=&r" (qload3), [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter12] "r" (filter12), [filter34] "r" (filter34),
- [filter56] "r" (filter56), [filter78] "r" (filter78),
- [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
src += 16;
dst += 16;
@@ -640,11 +627,9 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
}
static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+ const int16_t *filter_x0, int32_t h) {
int32_t y, c;
const uint8_t *src;
uint8_t *dst;
@@ -673,7 +658,7 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -931,16 +916,15 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [qload3] "=&r" (qload3), [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter12] "r" (filter12), [filter34] "r" (filter34),
- [filter56] "r" (filter56), [filter78] "r" (filter78),
- [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
src += 16;
dst += 16;
@@ -961,22 +945,17 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(((const int32_t *)filter_x)[1] != 0x800000);
if (((const int32_t *)filter_x)[0] == 0) {
- vpx_convolve2_avg_horiz_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
} else {
uint32_t pos = 38;
src -= 3;
    /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
/* prefetch data to cache memory */
prefetch_load(src);
@@ -985,39 +964,32 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
- convolve_avg_horiz_4_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h);
+ convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
break;
case 8:
- convolve_avg_horiz_8_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h);
+ convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
break;
case 16:
- convolve_avg_horiz_16_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h, 1);
+ convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 1);
break;
case 32:
- convolve_avg_horiz_16_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h, 2);
+ convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 2);
break;
case 64:
prefetch_load(src + 64);
prefetch_store(dst + 32);
- convolve_avg_horiz_64_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, h);
+ convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
break;
default:
- vpx_convolve8_avg_horiz_c(src + 3, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w,
+ h);
break;
}
}
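
The avg-horizontal kernels above combine an 8-tap horizontal filter (rounding constant 64, 7-bit shift, clamp via the crop table) with the same rounded average into the existing dst pixel; the DSPr2 code packs pairs of taps into 32-bit words (filter12 .. filter78) so dpa.w.ph can accumulate two products at once. A plain-C sketch of the per-pixel computation follows, assuming src has already been moved three pixels left as the wrapper does with src -= 3 (names are illustrative only):

    /* Minimal reference sketch of the 8-tap horizontal filter averaged into dst. */
    static void convolve_avg_horiz_ref(const unsigned char *src, int src_stride,
                                       unsigned char *dst, int dst_stride,
                                       const short *filter_x, int w, int h) {
      int x, y, k;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          int sum = 0;
          for (k = 0; k < 8; ++k) sum += src[x + k] * filter_x[k];
          {
            const int v = (sum + 64) >> 7; /* round, shift by FILTER_BITS */
            const int p = v < 0 ? 0 : (v > 255 ? 255 : v);
            dst[x] = (unsigned char)((dst[x] + p + 1) >> 1); /* average with dst */
          }
        }
        src += src_stride;
        dst += dst_stride;
      }
    }
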
diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c
index ddad18692..789ec8d53 100644
--- a/vpx_dsp/mips/convolve8_dspr2.c
+++ b/vpx_dsp/mips/convolve8_dspr2.c
@@ -19,8 +19,7 @@
#if HAVE_DSPR2
static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
@@ -45,7 +44,7 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -118,15 +117,14 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
"sb %[p2], 0(%[dst_ptr]) \n\t"
"addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
- [dst_ptr] "+r" (dst_ptr)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
+ [dst_stride] "r"(dst_stride));
/* Next row... */
src += src_stride;
@@ -135,8 +133,7 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
}
static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
@@ -164,7 +161,7 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
dst_ptr = dst;
odd_dst = (dst_ptr + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp2], 0(%[src]) \n\t"
"ulw %[tp1], 4(%[src]) \n\t"
@@ -293,16 +290,14 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
"sb %[n1], 0(%[odd_dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [n1] "=&r" (n1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a), [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
/* Next row... */
src += src_stride;
@@ -310,25 +305,21 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
}
}
-static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
- int32_t count) {
+static void convolve_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
int32_t c, y;
const uint8_t *src;
uint8_t *dst;
uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
uint32_t qload1, qload2;
uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3;
uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
+ uint8_t *odd_dst;
filter12 = ((const int32_t *)filter_x0)[0];
filter34 = ((const int32_t *)filter_x0)[1];
@@ -346,248 +337,439 @@ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
odd_dst = (dst + dst_stride);
for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
/* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "ulw %[qload2], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
/* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
/* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
/* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 16(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 16(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
/* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
/* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
/* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
/* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
/* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
/* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
/* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
/* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
/* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 17(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
/* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
/* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
/* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
/* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
- : [filter12] "r" (filter12), [filter34] "r" (filter34),
- [filter56] "r" (filter56), [filter78] "r" (filter78),
- [vector_64] "r" (vector_64), [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
+ "dpa.w.ph $ac1, %[p3], %[filter12] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
src += 16;
dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
@@ -601,24 +783,21 @@ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
}
}
-static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
int32_t c, y;
const uint8_t *src;
uint8_t *dst;
uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
uint32_t qload1, qload2;
uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3;
uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
+ uint8_t *odd_dst;
filter12 = ((const int32_t *)filter_x0)[0];
filter34 = ((const int32_t *)filter_x0)[1];
@@ -637,248 +816,439 @@ static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
odd_dst = (dst + dst_stride);
for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
/* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "ulw %[qload2], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
/* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
/* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
/* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 16(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 16(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
/* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
/* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
/* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
/* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
/* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
/* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
/* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
/* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
/* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 17(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
/* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
/* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
/* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
/* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
- : [filter12] "r" (filter12), [filter34] "r" (filter34),
- [filter56] "r" (filter56), [filter78] "r" (filter78),
- [vector_64] "r" (vector_64), [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
+ "dpa.w.ph $ac1, %[p3], %[filter12] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
src += 16;
dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
@@ -901,8 +1271,7 @@ void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
int sum = 0;
- for (k = 0; k < 8; ++k)
- sum += src[x + k] * filter[k];
+ for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
}
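
For reference, the two helpers used in the loop above are defined outside this diff; below is a minimal, self-contained scalar sketch of the same per-pixel computation, assuming libvpx's FILTER_BITS of 7 and the usual 0..255 clamp (the sk_ names are hypothetical, not from the tree):

#include <stdint.h>

#define SK_FILTER_BITS 7 /* assumption: matches libvpx FILTER_BITS */

static uint8_t sk_clip_pixel(int v) { /* clip_pixel() stand-in */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One output pixel of the 8-tap filter, rounded the same way as
   ROUND_POWER_OF_TWO(sum, FILTER_BITS). */
static uint8_t sk_filter8(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  return sk_clip_pixel((sum + (1 << (SK_FILTER_BITS - 1))) >> SK_FILTER_BITS);
}

With this helper, the loop body above is simply dst[x * dst_stride] = sk_filter8(src + x, filter).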
@@ -913,8 +1282,7 @@ void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
}
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- int w, int h) {
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {
@@ -927,10 +1295,9 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
}
}
-void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
@@ -941,27 +1308,20 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(((const int32_t *)filter_x)[1] != 0x800000);
assert(((const int32_t *)filter_y)[1] != 0x800000);
-
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
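
As I read the MIPS DSP ASE documentation (an assumption, not something this patch states), the wrdsp above writes pos = 38 into DSPControl (the mask value 1 selects the pos field), and the later "extp ..., 31" instructions then return accumulator bits [38:7], i.e. the accumulated sum shifted right by FILTER_BITS, with the preloaded 64 supplying round-to-nearest. A one-line model of that extraction, sketch only:

#include <stdint.h>

/* Hypothetical model of extp with size 31 when DSPControl.pos == 38:
   the 32-bit field acc[38:7], i.e. the rounded sum >> FILTER_BITS. */
static int32_t sk_extp38_31(int64_t acc) { return (int32_t)(acc >> 7); }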
- if (intermediate_height < h)
- intermediate_height = h;
+ if (intermediate_height < h) intermediate_height = h;
/* copy the src to dst */
if (filter_x[3] == 0x80) {
- copy_horiz_transposed(src - src_stride * 3, src_stride,
- temp, intermediate_height,
- w, intermediate_height);
+ copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
+ intermediate_height, w, intermediate_height);
} else if (((const int32_t *)filter_x)[0] == 0) {
- vpx_convolve2_dspr2(src - src_stride * 3, src_stride,
- temp, intermediate_height,
- filter_x,
- w, intermediate_height);
+ vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
+ intermediate_height, filter_x, w, intermediate_height);
} else {
src -= (src_stride * 3 + 3);
@@ -971,31 +1331,29 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
- convolve_horiz_4_transposed_dspr2(src, src_stride,
- temp, intermediate_height,
- filter_x, intermediate_height);
+ convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
break;
case 8:
- convolve_horiz_8_transposed_dspr2(src, src_stride,
- temp, intermediate_height,
- filter_x, intermediate_height);
+ convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
break;
case 16:
case 32:
- convolve_horiz_16_transposed_dspr2(src, src_stride,
- temp, intermediate_height,
- filter_x, intermediate_height,
- (w/16));
+ convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height, (w / 16));
break;
case 64:
prefetch_load(src + 32);
- convolve_horiz_64_transposed_dspr2(src, src_stride,
- temp, intermediate_height,
- filter_x, intermediate_height);
+ convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
break;
default:
- convolve_horiz_transposed(src, src_stride,
- temp, intermediate_height,
+ convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
filter_x, w, intermediate_height);
break;
}
@@ -1003,40 +1361,31 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
/* copy the src to dst */
if (filter_y[3] == 0x80) {
- copy_horiz_transposed(temp + 3, intermediate_height,
- dst, dst_stride,
- h, w);
+ copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
} else if (((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve2_dspr2(temp + 3, intermediate_height,
- dst, dst_stride,
- filter_y,
- h, w);
+ vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
+ filter_y, h, w);
} else {
switch (h) {
case 4:
- convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
- dst, dst_stride,
- filter_y, w);
+ convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
break;
case 8:
- convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
- dst, dst_stride,
- filter_y, w);
+ convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
break;
case 16:
case 32:
- convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
- dst, dst_stride,
- filter_y, w, (h/16));
+ convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w, (h / 16));
break;
case 64:
- convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
- dst, dst_stride,
- filter_y, w);
+ convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
break;
default:
- convolve_horiz_transposed(temp, intermediate_height,
- dst, dst_stride,
+ convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
filter_y, h, w);
break;
}
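
Taken together, the two switch statements above implement the separable filter as two horizontal passes over transposed data. A simplified scalar sketch of that structure, assuming y_step_q4 == 16 and ignoring the copy/2-tap special cases; the sk2_ names are hypothetical:

#include <stdint.h>

static uint8_t sk2_filter8(const uint8_t *s, const int16_t *f) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += s[k] * f[k];
  sum = (sum + 64) >> 7; /* ROUND_POWER_OF_TWO(sum, FILTER_BITS) */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

/* Filter every row with an 8-tap kernel and store the result transposed:
   input row y, output column x  ->  dst[x * dst_stride + y]. */
static void sk2_horiz_transposed(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 const int16_t *filter, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x * dst_stride + y] = sk2_filter8(src + x, filter);
    src += src_stride;
  }
}

/* Two transposed horizontal passes give one separable 2-D filter: the second
   "horizontal" pass runs down the columns of the original image and writes
   the output back in its normal orientation. */
static void sk2_convolve8(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, const int16_t *filter_x,
                          const int16_t *filter_y, int w, int h) {
  uint8_t temp[64 * 135];                /* same scratch size as above */
  const int intermediate_height = h + 7; /* ((h * 16) >> 4) + 7 */

  sk2_horiz_transposed(src - 3 * src_stride - 3, src_stride, temp,
                       intermediate_height, filter_x, w, intermediate_height);
  sk2_horiz_transposed(temp, intermediate_height, dst, dst_stride, filter_y, h,
                       w);
}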
@@ -1056,97 +1405,87 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
prefetch_store(dst);
switch (w) {
- case 4:
- {
+ case 4: {
uint32_t tp1;
/* 1 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], (%[src]) \n\t"
- "sw %[tp1], (%[dst]) \n\t" /* store */
+ "sw %[tp1], (%[dst]) \n\t" /* store */
- : [tp1] "=&r" (tp1)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tp1] "=&r"(tp1)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
}
- }
- break;
- case 8:
- {
+ } break;
+ case 8: {
uint32_t tp1, tp2;
/* 2 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
}
- }
- break;
- case 16:
- {
+ } break;
+ case 16: {
uint32_t tp1, tp2, tp3, tp4;
/* 4 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
"ulw %[tp3], 8(%[src]) \n\t"
"ulw %[tp4], 12(%[src]) \n\t"
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
}
- }
- break;
- case 32:
- {
+ } break;
+ case 32: {
uint32_t tp1, tp2, tp3, tp4;
uint32_t tp5, tp6, tp7, tp8;
/* 8 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
"ulw %[tp3], 8(%[src]) \n\t"
@@ -1156,29 +1495,25 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"ulw %[tp7], 24(%[src]) \n\t"
"ulw %[tp8], 28(%[src]) \n\t"
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
- "sw %[tp5], 16(%[dst]) \n\t" /* store */
- "sw %[tp6], 20(%[dst]) \n\t" /* store */
- "sw %[tp7], 24(%[dst]) \n\t" /* store */
- "sw %[tp8], 28(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
- [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
}
- }
- break;
- case 64:
- {
+ } break;
+ case 64: {
uint32_t tp1, tp2, tp3, tp4;
uint32_t tp5, tp6, tp7, tp8;
@@ -1186,14 +1521,14 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
prefetch_store(dst + 32);
/* 16 word storage */
- for (y = h; y--; ) {
+ for (y = h; y--;) {
prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32);
prefetch_load(src + src_stride + 64);
prefetch_store(dst + dst_stride);
prefetch_store(dst + dst_stride + 32);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
"ulw %[tp3], 8(%[src]) \n\t"
@@ -1203,14 +1538,14 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"ulw %[tp7], 24(%[src]) \n\t"
"ulw %[tp8], 28(%[src]) \n\t"
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
- "sw %[tp5], 16(%[dst]) \n\t" /* store */
- "sw %[tp6], 20(%[dst]) \n\t" /* store */
- "sw %[tp7], 24(%[dst]) \n\t" /* store */
- "sw %[tp8], 28(%[dst]) \n\t" /* store */
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
"ulw %[tp1], 32(%[src]) \n\t"
"ulw %[tp2], 36(%[src]) \n\t"
@@ -1221,29 +1556,26 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"ulw %[tp7], 56(%[src]) \n\t"
"ulw %[tp8], 60(%[src]) \n\t"
- "sw %[tp1], 32(%[dst]) \n\t" /* store */
- "sw %[tp2], 36(%[dst]) \n\t" /* store */
- "sw %[tp3], 40(%[dst]) \n\t" /* store */
- "sw %[tp4], 44(%[dst]) \n\t" /* store */
- "sw %[tp5], 48(%[dst]) \n\t" /* store */
- "sw %[tp6], 52(%[dst]) \n\t" /* store */
- "sw %[tp7], 56(%[dst]) \n\t" /* store */
- "sw %[tp8], 60(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
- [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
- [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
- : [src] "r" (src), [dst] "r" (dst)
- );
+ "sw %[tp1], 32(%[dst]) \n\t" /* store */
+ "sw %[tp2], 36(%[dst]) \n\t" /* store */
+ "sw %[tp3], 40(%[dst]) \n\t" /* store */
+ "sw %[tp4], 44(%[dst]) \n\t" /* store */
+ "sw %[tp5], 48(%[dst]) \n\t" /* store */
+ "sw %[tp6], 52(%[dst]) \n\t" /* store */
+ "sw %[tp7], 56(%[dst]) \n\t" /* store */
+ "sw %[tp8], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
src += src_stride;
dst += dst_stride;
}
- }
- break;
+ } break;
default:
- for (y = h; y--; ) {
+ for (y = h; y--;) {
for (x = 0; x < w; ++x) {
dst[x] = src[x];
}
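
The copy kernel above is a plain row copy done a 32-bit word at a time with unaligned ulw loads and prefetch hints; a portable sketch of the same behaviour (sk3_ name hypothetical):

#include <stdint.h>
#include <string.h>

/* Portable model of vpx_convolve_copy (sketch only): every row is copied
   unchanged; the filter arguments of the real function are ignored here. */
static void sk3_convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}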
diff --git a/vpx_dsp/mips/convolve8_horiz_dspr2.c b/vpx_dsp/mips/convolve8_horiz_dspr2.c
index ae78bab89..196a0a2f0 100644
--- a/vpx_dsp/mips/convolve8_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_horiz_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
int32_t vector1b, vector2b, vector3b, vector4b;
@@ -45,7 +42,7 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -111,17 +108,15 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
"sb %[tp2], 2(%[dst]) \n\t"
"sb %[n2], 3(%[dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -129,12 +124,9 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
}
}
-static void convolve_horiz_8_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y;
uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64;
@@ -156,7 +148,7 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t"
@@ -275,17 +267,15 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
"sb %[p2], 5(%[dst]) \n\t"
"sb %[n1], 7(%[dst]) \n\t"
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
- [st0] "=&r" (st0), [st1] "=&r" (st1),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [n1] "=&r" (n1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a),
- [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
- );
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
/* Next row... */
src += src_stride;
@@ -293,12 +283,9 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
}
}
-static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
int32_t count) {
int32_t y, c;
const uint8_t *src;
@@ -326,7 +313,7 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -542,17 +529,15 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter12] "r" (filter12), [filter34] "r" (filter34),
- [filter56] "r" (filter56), [filter78] "r" (filter78),
- [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst),
- [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
src += 16;
dst += 16;
@@ -564,12 +549,9 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
}
}
-static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
int32_t y, c;
const uint8_t *src;
uint8_t *dst;
@@ -598,7 +580,7 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t"
@@ -814,17 +796,15 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [p5] "=&r" (p5),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
- : [filter12] "r" (filter12), [filter34] "r" (filter34),
- [filter56] "r" (filter56), [filter78] "r" (filter78),
- [vector_64] "r" (vector_64),
- [cm] "r" (cm), [dst] "r" (dst),
- [src] "r" (src)
- );
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
src += 16;
dst += 16;
@@ -839,17 +819,14 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
assert(x_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
if (((const int32_t *)filter_x)[0] == 0) {
- vpx_convolve2_horiz_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
} else {
uint32_t pos = 38;
@@ -857,11 +834,9 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
/* prefetch data to cache memory */
prefetch_load(src);
@@ -870,39 +845,31 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) {
case 4:
- convolve_horiz_4_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h);
+ convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
break;
case 8:
- convolve_horiz_8_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h);
+ convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
break;
case 16:
- convolve_horiz_16_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h, 1);
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
break;
case 32:
- convolve_horiz_16_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h, 2);
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
break;
case 64:
prefetch_load(src + 64);
prefetch_store(dst + 32);
- convolve_horiz_64_dspr2(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filter_x, (int32_t)h);
+ convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
break;
default:
- vpx_convolve8_horiz_c(src + 3, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
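
The dispatch in these wrappers reads the eight int16_t taps as packed int32_t words, as the (const int32_t *) casts above show: a zero first word means taps 0 and 1 are zero and the kernel is handled by the 2-tap (bilinear) path, while vpx_convolve8_dspr2 additionally treats a 0x80 (== 128) centre tap as a plain copy. A sketch of that classification, with hypothetical sk4_ names; the exact guarantees come from libvpx's filter banks, which this diff does not show:

#include <stdint.h>

enum sk4_kind { SK4_COPY, SK4_TWO_TAP, SK4_EIGHT_TAP };

static enum sk4_kind sk4_classify(const int16_t *filter) {
  const int32_t *packed = (const int32_t *)filter; /* two taps per word */
  if (filter[3] == 0x80) return SK4_COPY; /* unit (128) centre tap        */
  if (packed[0] == 0) return SK4_TWO_TAP; /* taps 0 and 1 zero: bilinear  */
  return SK4_EIGHT_TAP;                   /* general 8-tap kernel         */
}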
diff --git a/vpx_dsp/mips/convolve8_vert_dspr2.c b/vpx_dsp/mips/convolve8_vert_dspr2.c
index d553828c5..ad107d5c4 100644
--- a/vpx_dsp/mips/convolve8_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_vert_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t w,
+static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
int32_t h) {
int32_t x, y;
const uint8_t *src_ptr;
@@ -53,7 +50,7 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -152,19 +149,16 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [n1] "=&r" (n1), [n2] "=&r" (n2),
- [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
- [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -173,12 +167,9 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
}
}
-static void convolve_vert_64_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t h) {
+static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
int32_t x, y;
const uint8_t *src_ptr;
uint8_t *dst_ptr;
@@ -208,7 +199,7 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x;
dst_ptr = dst + x;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t"
@@ -307,19 +298,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [n1] "=&r" (n1), [n2] "=&r" (n2),
- [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
- [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
- [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
- [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
}
/* Next row... */
@@ -331,50 +319,38 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000);
if (((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve2_vert_dspr2(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
} else {
uint32_t pos = 38;
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
prefetch_store(dst);
switch (w) {
- case 4 :
- case 8 :
- case 16 :
- case 32 :
- convolve_vert_4_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, w, h);
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
break;
- case 64 :
+ case 64:
prefetch_store(dst + 32);
- convolve_vert_64_dspr2(src, src_stride,
- dst, dst_stride,
- filter_y, h);
+ convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
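
For the vertical kernels above, the value computed per output pixel is the plain 8-tap column filter with the same rounding as the horizontal case; a scalar sketch, assuming FILTER_BITS == 7 and src pointing at the topmost of the eight input rows (sk5_ name hypothetical):

#include <stdint.h>

static uint8_t sk5_filter8_vert(const uint8_t *src, int src_stride, int x,
                                const int16_t *filter_y) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k * src_stride + x] * filter_y[k];
  sum = (sum + 64) >> 7; /* round to nearest, then >> FILTER_BITS */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}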
diff --git a/vpx_dsp/mips/convolve_common_dspr2.h b/vpx_dsp/mips/convolve_common_dspr2.h
index 66d77a285..4eee3bd5e 100644
--- a/vpx_dsp/mips/convolve_common_dspr2.h
+++ b/vpx_dsp/mips/convolve_common_dspr2.h
@@ -25,8 +25,8 @@ extern "C" {
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -37,19 +37,18 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
-void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter,
- int w, int h);
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h);
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
#endif // #if HAVE_DSPR2
#ifdef __cplusplus
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index e98a0399b..402d7ed99 100644
--- a/vpx_dsp/mips/deblock_msa.c
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -13,133 +13,132 @@
extern const int16_t vpx_rv[];
-#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, \
- out4, out5, out6, out7, \
- out8, out9, out10, out11, \
- out12, out13, out14, out15) \
-{ \
- v8i16 temp0, temp1, temp2, temp3, temp4; \
- v8i16 temp5, temp6, temp7, temp8, temp9; \
- \
- ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- temp0, temp1, temp2, temp3); \
- ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
- ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
- ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- temp0, temp1, temp2, temp3); \
- ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_UB(temp5, temp4, out8, out10); \
- ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_UB(temp5, temp4, out12, out14); \
- out0 = (v16u8)temp6; \
- out2 = (v16u8)temp7; \
- out4 = (v16u8)temp8; \
- out6 = (v16u8)temp9; \
- out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
- out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
- out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
- out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
- out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
- out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
- out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
-}
+#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, \
+ out15) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+ temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
+ ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+ temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out8, out10); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out12, out14); \
+ out0 = (v16u8)temp6; \
+ out2 = (v16u8)temp7; \
+ out4 = (v16u8)temp8; \
+ out6 = (v16u8)temp9; \
+ out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
+ out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
+ out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
+ out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
+ out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
+ }
-#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \
- below1_in, below2_in, ref, out) \
-{ \
- v16u8 temp0, temp1; \
- \
- temp1 = __msa_aver_u_b(above2_in, above1_in); \
- temp0 = __msa_aver_u_b(below2_in, below1_in); \
- temp1 = __msa_aver_u_b(temp1, temp0); \
- out = __msa_aver_u_b(src_in, temp1); \
- temp0 = __msa_asub_u_b(src_in, above2_in); \
- temp1 = __msa_asub_u_b(src_in, above1_in); \
- temp0 = (temp0 < ref); \
- temp1 = (temp1 < ref); \
- temp0 = temp0 & temp1; \
- temp1 = __msa_asub_u_b(src_in, below1_in); \
- temp1 = (temp1 < ref); \
- temp0 = temp0 & temp1; \
- temp1 = __msa_asub_u_b(src_in, below2_in); \
- temp1 = (temp1 < ref); \
- temp0 = temp0 & temp1; \
- out = __msa_bmz_v(out, src_in, temp0); \
-}
+#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
+ ref, out) \
+ { \
+ v16u8 temp0, temp1; \
+ \
+ temp1 = __msa_aver_u_b(above2_in, above1_in); \
+ temp0 = __msa_aver_u_b(below2_in, below1_in); \
+ temp1 = __msa_aver_u_b(temp1, temp0); \
+ out = __msa_aver_u_b(src_in, temp1); \
+ temp0 = __msa_asub_u_b(src_in, above2_in); \
+ temp1 = __msa_asub_u_b(src_in, above1_in); \
+ temp0 = (temp0 < ref); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below1_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below2_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ out = __msa_bmz_v(out, src_in, temp0); \
+ }
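
VPX_AVER_IF_RETAIN smooths a pixel with its four vertical neighbours but keeps the original value unless every neighbour is within the deringing threshold. A per-pixel sketch of that logic (illustrative only; the helper name is hypothetical, and the byte-vector compares are written as plain integer tests):

#include <stdint.h>
#include <stdlib.h>

/* Rounded averages mirror __msa_aver_u_b ((a + b + 1) >> 1); the threshold
 * tests mirror __msa_asub_u_b followed by an unsigned compare. */
static uint8_t aver_if_retain(uint8_t above2, uint8_t above1, uint8_t src,
                              uint8_t below1, uint8_t below2, uint8_t ref) {
  const uint8_t avg_above = (uint8_t)((above2 + above1 + 1) >> 1);
  const uint8_t avg_below = (uint8_t)((below2 + below1 + 1) >> 1);
  const uint8_t avg_outer = (uint8_t)((avg_above + avg_below + 1) >> 1);
  const uint8_t filtered = (uint8_t)((src + avg_outer + 1) >> 1);
  const int close = abs(src - above2) < ref && abs(src - above1) < ref &&
                    abs(src - below1) < ref && abs(src - below2) < ref;
  return close ? filtered : src; /* __msa_bmz_v keeps src where the mask is 0. */
}

In the loops below, the same macro is re-invoked with rotated arguments, which is how the five-row window slides down each column.
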
-#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15) \
-{ \
- v8i16 temp0, temp1, temp2, temp3, temp4; \
- v8i16 temp5, temp6, temp7, temp8, temp9; \
- \
- ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
- ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
- ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
- ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
- ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
- ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
- ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
- ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
- ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
- ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
- ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
- ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
- ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
- in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
- in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
- ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
- ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
- in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
- in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
- ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \
- temp2, temp3, temp4, temp5); \
- ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \
- temp6, temp7, temp8, temp9); \
- ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
- in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
- in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
- ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
- in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
- in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
-}
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
+ ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
+ ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
+ ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
+ ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
+ ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \
+ temp4, temp5); \
+ ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
+ temp7, temp8, temp9); \
+ ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
+ in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
+ ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
+ in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
+ }
-#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \
- in6, in7, in8, in9, in10, in11) \
-{ \
- v8i16 temp0, temp1, temp2, temp3; \
- v8i16 temp4, temp5, temp6, temp7; \
- \
- ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
- ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
- ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
- ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
- ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
- temp4 = __msa_ilvr_h(temp5, temp4); \
- ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
- temp5 = __msa_ilvr_h(temp7, temp6); \
- ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
- in0 = (v16u8)temp0; \
- in2 = (v16u8)temp1; \
- in4 = (v16u8)temp2; \
- in6 = (v16u8)temp3; \
- in8 = (v16u8)temp6; \
- in10 = (v16u8)temp7; \
- in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
- in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
- in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
- in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
- in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
- in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
-}
+#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
+ in9, in10, in11) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3; \
+ v8i16 temp4, temp5, temp6, temp7; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
+ temp4 = __msa_ilvr_h(temp5, temp4); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
+ temp5 = __msa_ilvr_h(temp7, temp6); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ in0 = (v16u8)temp0; \
+ in2 = (v16u8)temp1; \
+ in4 = (v16u8)temp2; \
+ in6 = (v16u8)temp3; \
+ in8 = (v16u8)temp6; \
+ in10 = (v16u8)temp7; \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
+ }
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
int32_t src_stride,
@@ -203,16 +202,16 @@ static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
- out0 = __msa_copy_u_d((v2i64) inter0, 0);
- out1 = __msa_copy_u_d((v2i64) inter1, 0);
- out2 = __msa_copy_u_d((v2i64) inter2, 0);
- out3 = __msa_copy_u_d((v2i64) inter3, 0);
+ out0 = __msa_copy_u_d((v2i64)inter0, 0);
+ out1 = __msa_copy_u_d((v2i64)inter1, 0);
+ out2 = __msa_copy_u_d((v2i64)inter2, 0);
+ out3 = __msa_copy_u_d((v2i64)inter3, 0);
SD4(out0, out1, out2, out3, p_dst, dst_stride);
- out0 = __msa_copy_u_d((v2i64) inter4, 0);
- out1 = __msa_copy_u_d((v2i64) inter5, 0);
- out2 = __msa_copy_u_d((v2i64) inter6, 0);
- out3 = __msa_copy_u_d((v2i64) inter7, 0);
+ out0 = __msa_copy_u_d((v2i64)inter4, 0);
+ out1 = __msa_copy_u_d((v2i64)inter5, 0);
+ out2 = __msa_copy_u_d((v2i64)inter6, 0);
+ out3 = __msa_copy_u_d((v2i64)inter7, 0);
SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
}
@@ -236,36 +235,36 @@ static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
src = inter2;
below1 = inter3;
below2 = inter4;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
above2 = inter5;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
above1 = inter6;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
src = inter7;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
below1 = inter8;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
below2 = inter9;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
if (col == (cols / 8 - 1)) {
above2 = inter9;
} else {
above2 = inter10;
}
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
if (col == (cols / 8 - 1)) {
above1 = inter9;
} else {
above1 = inter11;
}
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
inter9, inter2, inter3, inter4, inter5, inter6, inter7,
@@ -371,36 +370,36 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
src = inter2;
below1 = inter3;
below2 = inter4;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
above2 = inter5;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
above1 = inter6;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
src = inter7;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
below1 = inter8;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
below2 = inter9;
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
if (col == (cols / 8 - 1)) {
above2 = inter9;
} else {
above2 = inter10;
}
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
if (col == (cols / 8 - 1)) {
above1 = inter9;
} else {
above1 = inter11;
}
- ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
inter8, inter9, inter2, inter3, inter4, inter5,
@@ -452,8 +451,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
int32_t row, col, cnt;
uint8_t *src_dup = src_ptr;
v16u8 src0, src, tmp_orig;
- v16u8 tmp = {0};
- v16i8 zero = {0};
+ v16u8 tmp = { 0 };
+ v16i8 zero = { 0 };
v8u16 sum_h, src_r_h, src_l_h;
v4u32 src_r_w, src_l_w;
v4i32 flimit_vec;
@@ -462,13 +461,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
for (row = rows; row--;) {
int32_t sum_sq = 0;
int32_t sum = 0;
- src0 = (v16u8) __msa_fill_b(src_dup[0]);
+ src0 = (v16u8)__msa_fill_b(src_dup[0]);
ST8x1_UB(src0, (src_dup - 8));
- src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]);
+ src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
ST_UB(src0, src_dup + cols);
src_dup[cols + 16] = src_dup[cols - 1];
- tmp_orig = (v16u8) __msa_ldi_b(0);
+ tmp_orig = (v16u8)__msa_ldi_b(0);
tmp_orig[15] = tmp[15];
src = LD_UB(src_dup - 8);
src[15] = 0;
@@ -508,9 +507,9 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
sum = sum_l[7];
src = LD_UB(src_dup + 16 * col);
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
- src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4);
- src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4);
- tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7);
+ src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
+ src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
+ tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
HADD_UB2_UH(src_r, src_l, add_r, add_l);
UNPCK_SH_SW(sub_r, sub0, sub1);
@@ -552,13 +551,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
- mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
- tmp = __msa_bmz_v(tmp, src, (v16u8) mask);
+ mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+ tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
if (col == 0) {
uint64_t src_d;
- src_d = __msa_copy_u_d((v2i64) tmp_orig, 1);
+ src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
SD(src_d, (src_dup - 8));
}
@@ -588,15 +587,15 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
for (col = 0; col < (cols >> 4); ++col) {
uint8_t *dst_tmp = &dst_ptr[col << 4];
v16u8 dst;
- v16i8 zero = {0};
+ v16i8 zero = { 0 };
v16u8 tmp[16];
v8i16 mult0, mult1, rv2_0, rv2_1;
- v8i16 sum0_h = {0};
- v8i16 sum1_h = {0};
- v4i32 mul0 = {0};
- v4i32 mul1 = {0};
- v4i32 mul2 = {0};
- v4i32 mul3 = {0};
+ v8i16 sum0_h = { 0 };
+ v8i16 sum1_h = { 0 };
+ v4i32 mul0 = { 0 };
+ v4i32 mul1 = { 0 };
+ v4i32 mul2 = { 0 };
+ v4i32 mul3 = { 0 };
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
v4i32 add0, add1, add2, add3;
const int16_t *rv2[16];
@@ -618,10 +617,10 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
dst = LD_UB(dst_tmp + (cnt * pitch));
UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
- mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0);
- mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0);
- mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1);
- mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1);
+ mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
+ mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
+ mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
+ mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
}
@@ -652,7 +651,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
- tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7);
+ tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
@@ -669,8 +668,8 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
- mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
- tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask);
+ mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+ tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
if (row >= 8) {
ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c
index f29c14b3d..e41a90480 100644
--- a/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -27,10 +27,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
SLLI_4V(in4, in5, in6, in7, 2);
SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
- step0, step1, step2, step3, in4, in5, in6, in7);
- BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
- step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
+ step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
ST_SH4(step0, step1, step2, step3, temp_buff, 8);
ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
@@ -45,10 +45,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
SLLI_4V(in4, in5, in6, in7, 2);
SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
- step0, step1, step2, step3, in4, in5, in6, in7);
- BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
- step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
+ step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
@@ -64,12 +64,12 @@ static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
/* fdct even */
LD_SH4(input, 8, in0, in1, in2, in3);
LD_SH4(input + 96, 8, in12, in13, in14, in15);
- BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
- vec0, vec1, vec2, vec3, in12, in13, in14, in15);
+ BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
+ vec3, in12, in13, in14, in15);
LD_SH4(input + 32, 8, in4, in5, in6, in7);
LD_SH4(input + 64, 8, in8, in9, in10, in11);
- BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
- vec4, vec5, vec6, vec7, in8, in9, in10, in11);
+ BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
+ in8, in9, in10, in11);
/* Stage 3 */
ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
@@ -258,28 +258,26 @@ static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
- in8, in9, in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15,
- step0, step1, step2, step3, step4, step5, step6, step7,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
+ step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
/* 2nd set */
LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
- in8, in9, in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15,
- step0, step1, step2, step3, step4, step5, step6, step7,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
+ step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
(output + 8 * 8), 8);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
@@ -299,10 +297,9 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15,
- vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
@@ -315,19 +312,19 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
UNPCK_SH_SW(vec5, vec5_l, vec5_r);
UNPCK_SH_SW(vec6, vec6_l, vec6_r);
UNPCK_SH_SW(vec7, vec7_l, vec7_r);
- ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
- tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+ ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
+ tmp1_w, tmp2_w, tmp3_w);
BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
- ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
- vec0_r, vec1_r, vec2_r, vec3_r);
+ ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
+ vec1_r, vec2_r, vec3_r);
tmp3_w = vec0_r + vec3_r;
vec0_r = vec0_r - vec3_r;
vec3_r = vec1_r + vec2_r;
vec1_r = vec1_r - vec2_r;
- DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
- cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+ DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
FDCT32_POSTPROC_NEG_W(vec4_r);
FDCT32_POSTPROC_NEG_W(tmp3_w);
FDCT32_POSTPROC_NEG_W(vec6_r);
@@ -335,8 +332,8 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
ST_SH2(vec5, vec4, out, 8);
- DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
- cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+ DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
FDCT32_POSTPROC_NEG_W(vec4_r);
FDCT32_POSTPROC_NEG_W(tmp3_w);
FDCT32_POSTPROC_NEG_W(vec6_r);
@@ -401,10 +398,9 @@ static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15,
- vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
/* Stage 3 */
ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
@@ -610,8 +606,8 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
in3 = LD_SH(temp + 192);
in5 = LD_SH(temp + 216);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
/* 2nd set */
in0_1 = LD_SH(temp + 16);
@@ -637,10 +633,10 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
in6 = LD_SH(temp + 104);
in7 = LD_SH(temp + 144);
- ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
- output + 8, 32);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
+ 32);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
/* 4th set */
@@ -655,12 +651,11 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
- ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
- output + 24, 32);
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
+ 32);
}
-static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
- int16_t *output) {
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
fdct8x32_1d_row_load_butterfly(temp, temp_buf);
fdct8x32_1d_row_even(temp_buf, temp_buf);
fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
@@ -706,10 +701,9 @@ static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15,
- vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c
index 0dd141f41..fdead5050 100644
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c
@@ -18,24 +18,24 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
- v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
- v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
- cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
- v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
- 0, 0, 0, 0 };
-
- LD_SH16(input, src_stride,
- in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+ v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+ v8i16 coeff2 = {
+ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
+ };
+
+ LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
+ in10, in11, in12, in13, in14, in15);
SLLI_4V(in0, in1, in2, in3, 2);
SLLI_4V(in4, in5, in6, in7, 2);
SLLI_4V(in8, in9, in10, in11, 2);
SLLI_4V(in12, in13, in14, in15, 2);
ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
- FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
@@ -137,10 +137,10 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
- in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
@@ -150,19 +150,19 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
SRA_4V(in8, in9, in10, in11, 2);
SRA_4V(in12, in13, in14, in15, 2);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
- tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
+ tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
- FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
- FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
- tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
- TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
- tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}
@@ -203,14 +203,14 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
SLLI_4V(in0, in1, in2, in3, 2);
SLLI_4V(in4, in5, in6, in7, 2);
- VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
diff --git a/vpx_dsp/mips/fwd_txfm_msa.h b/vpx_dsp/mips/fwd_txfm_msa.h
index d1e160eed..db5e90e7b 100644
--- a/vpx_dsp/mips/fwd_txfm_msa.h
+++ b/vpx_dsp/mips/fwd_txfm_msa.h
@@ -14,358 +14,365 @@
#include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h"
-#define LD_HADD(psrc, stride) ({ \
- v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
- v4i32 vec_w_m; \
- \
- LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
- ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
- LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
- ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \
- in4_m, in6_m, in0_m, in4_m); \
- in0_m += in4_m; \
- \
- vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
- HADD_SW_S32(vec_w_m); \
-})
+#define LD_HADD(psrc, stride) \
+ ({ \
+ v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
+ v4i32 vec_w_m; \
+ \
+ LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
+ ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
+ LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
+ ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
+ in0_m, in4_m); \
+ in0_m += in4_m; \
+ \
+ vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
+ HADD_SW_S32(vec_w_m); \
+ })
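
LD_HADD reduces an 8x8 block of int16 samples to a single sum; the stride is counted in int16 elements, since the loads advance an int16_t pointer. A scalar sketch, ignoring that the vector version accumulates in 16-bit lanes before widening (illustrative only; the helper name is hypothetical):

#include <stdint.h>

/* Sum the 64 int16 values of an 8x8 block addressed with a row stride. */
static int32_t block_8x8_sum(const int16_t *src, int stride) {
  int32_t sum = 0;
  int row, col;
  for (row = 0; row < 8; ++row)
    for (col = 0; col < 8; ++col) sum += src[row * stride + col];
  return sum;
}
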
-#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
- v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \
- cospi_24_64, -cospi_8_64, 0, 0, 0 }; \
- \
- BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
- ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
- SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
- \
- SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
- cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
- vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
- \
- vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
- cnst2_m = __msa_splati_h(coeff_m, 2); \
- cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
- vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
- \
- SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
- PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \
- vec7_m, vec7_m, out0, out2, out1, out3); \
-}
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 coeff_m = { \
+ cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
+ }; \
+ \
+ BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
+ vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
+ cnst2_m = __msa_splati_h(coeff_m, 2); \
+ cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
+ vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \
+ vec7_m, out0, out2, out1, out3); \
+ }
-#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) { \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- \
- SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \
- SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \
- AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, \
- in0, in1, in2, in3); \
- AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, \
- in4, in5, in6, in7); \
-}
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \
+ SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \
+ AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
+ in2, in3); \
+ AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
+ in6, in7); \
+ }
-#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
- v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \
- cospi_24_64, cospi_4_64, cospi_28_64, \
- cospi_12_64, cospi_20_64 }; \
- \
- /* FDCT stage1 */ \
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
- s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
- BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
- ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
- ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
- SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x1_m, x0_m); \
- out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
- \
- SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
- x2_m = -x2_m; \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
- \
- out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- x2_m = __msa_splati_h(coeff_m, 2); \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
- \
- /* stage2 */ \
- ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
- \
- s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
- \
- /* stage3 */ \
- BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
- \
- /* stage4 */ \
- ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
- ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
- \
- SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x0_m, x1_m); \
- out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
- \
- SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
- \
- x1_m = __msa_splati_h(coeff_m, 5); \
- x0_m = -x0_m; \
- x0_m = __msa_ilvev_h(x1_m, x0_m); \
- out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
- \
- x2_m = __msa_splati_h(coeff_m, 6); \
- x3_m = -x3_m; \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
-}
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+ \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
+ s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ \
+ s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ \
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ }
-#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
- v8i16 x0_m, x1_m, x2_m, x3_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
- cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v8i16 x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
\
- /* FDCT stage1 */ \
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
- s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
- BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
- ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
- ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
- SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x1_m, x0_m); \
- out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
+ s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
\
- SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
- x2_m = -x2_m; \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
\
- out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- x2_m = __msa_splati_h(coeff_m, 2); \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
\
- /* stage2 */ \
- ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
\
- s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
\
- /* stage3 */ \
- BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
\
- /* stage4 */ \
- ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
- ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
\
- SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x0_m, x1_m); \
- out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
\
- SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
\
- x1_m = __msa_splati_h(coeff_m, 5); \
- x0_m = -x0_m; \
- x0_m = __msa_ilvev_h(x1_m, x0_m); \
- out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
\
- x2_m = __msa_splati_h(coeff_m, 6); \
- x3_m = -x3_m; \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
-}
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ }
-#define FDCT8x16_ODD(input0, input1, input2, input3, \
- input4, input5, input6, input7, \
- out1, out3, out5, out7, \
- out9, out11, out13, out15) { \
- v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
- v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
- v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
- v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
- v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \
- cospi_24_64, -cospi_8_64, -cospi_24_64, \
- cospi_12_64, cospi_20_64 }; \
- v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, \
- cospi_18_64, cospi_10_64, cospi_22_64, \
- cospi_6_64, cospi_26_64 }; \
- v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64, \
- -cospi_26_64, 0, 0, 0, 0 }; \
- \
- /* stp 1 */ \
- ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
- ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \
- \
- cnst4_m = __msa_splati_h(coeff_m, 0); \
- stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \
- \
- cnst5_m = __msa_splati_h(coeff_m, 1); \
- cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \
- stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \
- stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \
- stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
- \
- /* stp2 */ \
- BUTTERFLY_4(input0, input1, stp22_m, stp23_m, \
- stp30_m, stp31_m, stp32_m, stp33_m); \
- BUTTERFLY_4(input7, input6, stp25_m, stp24_m, \
- stp37_m, stp36_m, stp35_m, stp34_m); \
- \
- ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
- ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
- \
- SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
- \
- cnst0_m = __msa_splati_h(coeff_m, 4); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
- \
- SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
- \
- cnst0_m = __msa_splati_h(coeff_m, 3); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
- \
- /* stp4 */ \
- BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, \
- vec6_m, vec2_m, vec4_m, vec5_m); \
- BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, \
- stp21_m, stp23_m, stp24_m, stp31_m); \
- \
- ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- \
- out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- cnst0_m = __msa_splati_h(coeff2_m, 0); \
- cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- \
- out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- \
- cnst1_m = __msa_splati_h(coeff2_m, 2); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- cnst0_m = __msa_splati_h(coeff2_m, 1); \
- cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- \
- out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- \
- cnst1_m = __msa_splati_h(coeff2_m, 3); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
-}
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
+ input7, out1, out3, out5, out7, out9, out11, out13, \
+ out15) \
+ { \
+ v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
+ v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \
+ v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \
+ v8i16 coeff2_m = { \
+ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \
+ }; \
+ \
+ /* stp 1 */ \
+ ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
+ ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __msa_splati_h(coeff_m, 0); \
+ stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \
+ \
+ cnst5_m = __msa_splati_h(coeff_m, 1); \
+ cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \
+ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \
+ stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \
+ stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
+ \
+ /* stp2 */ \
+ BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \
+ stp33_m); \
+ BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \
+ stp34_m); \
+ \
+ ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
+ ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 4); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 3); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ /* stp4 */ \
+ BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \
+ vec5_m); \
+ BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
+ stp31_m); \
+ \
+ ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 0); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 2); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 1); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 3); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ }
-#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \
- v8i16 tp0_m, tp1_m; \
- v8i16 one_m = __msa_ldi_h(1); \
- \
- tp0_m = __msa_clti_s_h(vec0, 0); \
- tp1_m = __msa_clti_s_h(vec1, 0); \
- vec0 += 1; \
- vec1 += 1; \
- tp0_m = one_m & tp0_m; \
- tp1_m = one_m & tp1_m; \
- vec0 += tp0_m; \
- vec1 += tp1_m; \
- vec0 >>= 2; \
- vec1 >>= 2; \
-}
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+ { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one_m = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clti_s_h(vec0, 0); \
+ tp1_m = __msa_clti_s_h(vec1, 0); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one_m & tp0_m; \
+ tp1_m = one_m & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+ }
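
FDCT_POSTPROC_2V_NEG_H applies the forward-transform rounding used by the scalar transforms: add 1, add another 1 when the coefficient is negative, then arithmetic-shift right by 2. Per element (illustrative only; the helper name is hypothetical), with FDCT32_POSTPROC_NEG_W below doing the same on 32-bit lanes:

#include <stdint.h>

/* Round a coefficient and divide by 4, biasing negative inputs by one extra
 * so the result matches (x + 1 + (x < 0)) >> 2. */
static int16_t fdct_round_shift2(int16_t x) {
  return (int16_t)((x + 1 + (x < 0)) >> 2);
}
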
-#define FDCT32_POSTPROC_NEG_W(vec) { \
- v4i32 temp_m; \
- v4i32 one_m = __msa_ldi_w(1); \
- \
- temp_m = __msa_clti_s_w(vec, 0); \
- vec += 1; \
- temp_m = one_m & temp_m; \
- vec += temp_m; \
- vec >>= 2; \
-}
+#define FDCT32_POSTPROC_NEG_W(vec) \
+ { \
+ v4i32 temp_m; \
+ v4i32 one_m = __msa_ldi_w(1); \
+ \
+ temp_m = __msa_clti_s_w(vec, 0); \
+ vec += 1; \
+ temp_m = one_m & temp_m; \
+ vec += temp_m; \
+ vec >>= 2; \
+ }
-#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \
- v8i16 tp0_m, tp1_m; \
- v8i16 one = __msa_ldi_h(1); \
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+ { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one = __msa_ldi_h(1); \
\
- tp0_m = __msa_clei_s_h(vec0, 0); \
- tp1_m = __msa_clei_s_h(vec1, 0); \
- tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
- tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
- vec0 += 1; \
- vec1 += 1; \
- tp0_m = one & tp0_m; \
- tp1_m = one & tp1_m; \
- vec0 += tp0_m; \
- vec1 += tp1_m; \
- vec0 >>= 2; \
- vec1 >>= 2; \
-}
+ tp0_m = __msa_clei_s_h(vec0, 0); \
+ tp1_m = __msa_clei_s_h(vec1, 0); \
+ tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
+ tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one & tp0_m; \
+ tp1_m = one & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+ }
-#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \
- reg1_right, const0, const1, \
- out0, out1, out2, out3) { \
- v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
- v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
- v4i32 k0_m = __msa_fill_w((int32_t) const0); \
- \
- s0_m = __msa_fill_w((int32_t) const1); \
- k0_m = __msa_ilvev_w(s0_m, k0_m); \
- \
- ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
- ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \
- ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \
- ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \
- \
- DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \
- DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \
- tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
- tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
- tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
- tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
- out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
- out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
- \
- DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \
- DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \
- tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
- tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
- tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
- tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
- out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
- out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
-}
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
+ const0, const1, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
+ v4i32 k0_m = __msa_fill_w((int32_t)const0); \
+ \
+ s0_m = __msa_fill_w((int32_t)const1); \
+ k0_m = __msa_ilvev_w(s0_m, k0_m); \
+ \
+ ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
+ ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \
+ ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \
+ ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \
+ \
+ DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ \
+ DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ }
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
int32_t src_stride);
diff --git a/vpx_dsp/mips/idct16x16_msa.c b/vpx_dsp/mips/idct16x16_msa.c
index 5faac715e..2a211c567 100644
--- a/vpx_dsp/mips/idct16x16_msa.c
+++ b/vpx_dsp/mips/idct16x16_msa.c
@@ -20,10 +20,10 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
input += 8;
LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
- TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
- reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
- reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+ TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
+ reg2, reg3, reg4, reg5, reg6, reg7);
+ TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
+ reg9, reg10, reg11, reg12, reg13, reg14, reg15);
DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
@@ -93,13 +93,13 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
reg3 = tmp7;
/* transpose block */
- TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
- reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+ TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
+ reg2, reg4, reg6, reg8, reg10, reg12, reg14);
ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
/* transpose block */
- TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
- reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+ TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
+ reg13, reg11, reg5, reg7, reg9, reg1, reg15);
ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
}
@@ -233,7 +233,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
/* short case just considers top 4 rows as valid output */
out += 4 * 16;
for (i = 12; i--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw $zero, 0(%[out]) \n\t"
"sw $zero, 4(%[out]) \n\t"
"sw $zero, 8(%[out]) \n\t"
@@ -244,8 +244,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
"sw $zero, 28(%[out]) \n\t"
:
- : [out] "r" (out)
- );
+ : [out] "r"(out));
out += 16;
}
@@ -283,8 +282,8 @@ void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
CLIP_SH4_0_255(res0, res1, res2, res3);
CLIP_SH4_0_255(res4, res5, res6, res7);
- PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
- tmp0, tmp1, tmp2, tmp3);
+ PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
+ tmp2, tmp3);
ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
dst += (4 * dst_stride);
}
@@ -295,29 +294,28 @@ void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
/* load input data */
- LD_SH16(input, 8,
- l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15);
- TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
- l0, l1, l2, l3, l4, l5, l6, l7);
- TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
- l8, l9, l10, l11, l12, l13, l14, l15);
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
+ l7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
+ l12, l13, l14, l15);
/* ADST in horizontal */
- VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
- l8, l9, l10, l11, l12, l13, l14, l15,
- r0, r1, r2, r3, r4, r5, r6, r7,
- r8, r9, r10, r11, r12, r13, r14, r15);
+ VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
+ l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
+ r12, r13, r14, r15);
l1 = -r8;
l3 = -r4;
l13 = -r13;
l15 = -r1;
- TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
- l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
+ l6, l7);
ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
- TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
- l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
+ l13, l14, l15);
ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
}
diff --git a/vpx_dsp/mips/idct32x32_msa.c b/vpx_dsp/mips/idct32x32_msa.c
index d5b3966e0..2ea6136f9 100644
--- a/vpx_dsp/mips/idct32x32_msa.c
+++ b/vpx_dsp/mips/idct32x32_msa.c
@@ -17,10 +17,10 @@ static void idct32x8_row_transpose_store(const int16_t *input,
/* 1st & 2nd 8x8 */
LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
- m0, n0, m1, n1, m2, n2, m3, n3);
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
- m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
@@ -28,10 +28,10 @@ static void idct32x8_row_transpose_store(const int16_t *input,
/* 3rd & 4th 8x8 */
LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
- m0, n0, m1, n1, m2, n2, m3, n3);
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
- m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
@@ -186,8 +186,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
/* 4 Stores */
- SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
- vec0, vec1, vec2, vec3);
+ SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
@@ -198,8 +197,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
/* 4 Stores */
- ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
- vec1, vec2, vec0, vec3);
+ ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
ST_SH(reg0, (tmp_odd_buf + 13 * 8));
ST_SH(reg1, (tmp_odd_buf + 14 * 8));
@@ -213,8 +211,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
- ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
- loc0, loc1, loc2, loc3);
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
@@ -228,8 +225,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
- ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
- loc0, loc1, loc2, loc3);
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
@@ -242,8 +238,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
static void idct_butterfly_transpose_store(int16_t *tmp_buf,
int16_t *tmp_eve_buf,
- int16_t *tmp_odd_buf,
- int16_t *dst) {
+ int16_t *tmp_odd_buf, int16_t *dst) {
v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
@@ -317,26 +312,26 @@ static void idct_butterfly_transpose_store(int16_t *tmp_buf,
/* Transpose : 16 vectors */
/* 1st & 2nd 8x8 */
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
- m0, n0, m1, n1, m2, n2, m3, n3);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
- m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
/* 3rd & 4th 8x8 */
LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
- m0, n0, m1, n1, m2, n2, m3, n3);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
- m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
}
@@ -349,8 +344,8 @@ static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
idct32x8_row_transpose_store(input, &tmp_buf[0]);
idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
- idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
- &tmp_odd_buf[0], output);
+ idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+ output);
}
static void idct8x32_column_even_process_store(int16_t *tmp_buf,
@@ -541,8 +536,7 @@ static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
}
static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
- int16_t *tmp_odd_buf,
- uint8_t *dst,
+ int16_t *tmp_odd_buf, uint8_t *dst,
int32_t dst_stride) {
v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
@@ -563,8 +557,8 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
SRARI_H4_SH(m0, m2, m4, m6, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
- m0, m2, m4, m6);
+ VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
+ m6);
/* Load 8 & Store 8 */
vec0 = LD_SH(tmp_odd_buf + 4 * 8);
@@ -578,13 +572,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
SRARI_H4_SH(m1, m3, m5, m7, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
- m1, m3, m5, m7);
+ VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
SRARI_H4_SH(m1, m3, m5, m7, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
- m1, m3, m5, m7);
+ VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
+ m7);
/* Load 8 & Store 8 */
vec0 = LD_SH(tmp_odd_buf + 2 * 8);
@@ -598,13 +591,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
SRARI_H4_SH(n0, n2, n4, n6, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
- n0, n2, n4, n6);
+ VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
SRARI_H4_SH(n0, n2, n4, n6, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
- n0, n2, n4, n6);
+ VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
+ n6);
/* Load 8 & Store 8 */
vec0 = LD_SH(tmp_odd_buf + 5 * 8);
@@ -618,13 +610,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
SRARI_H4_SH(n1, n3, n5, n7, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
- n1, n3, n5, n7);
+ VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
SRARI_H4_SH(n1, n3, n5, n7, 6);
- VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
- n1, n3, n5, n7);
+ VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
+ n7);
}
static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
@@ -634,8 +625,8 @@ static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
- idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
- dst, dst_stride);
+ idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+ dst_stride);
}
void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
@@ -665,7 +656,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
int16_t *out_ptr = out_arr;
for (i = 32; i--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw $zero, 0(%[out_ptr]) \n\t"
"sw $zero, 4(%[out_ptr]) \n\t"
"sw $zero, 8(%[out_ptr]) \n\t"
@@ -684,8 +675,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
"sw $zero, 60(%[out_ptr]) \n\t"
:
- : [out_ptr] "r" (out_ptr)
- );
+ : [out_ptr] "r"(out_ptr));
out_ptr += 32;
}
@@ -728,8 +718,8 @@ void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
CLIP_SH4_0_255(res0, res1, res2, res3);
CLIP_SH4_0_255(res4, res5, res6, res7);
- PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
- tmp0, tmp1, tmp2, tmp3);
+ PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
+ tmp2, tmp3);
ST_UB2(tmp0, tmp1, dst, 16);
dst += dst_stride;
diff --git a/vpx_dsp/mips/idct4x4_msa.c b/vpx_dsp/mips/idct4x4_msa.c
index f289d8eda..0a85742f1 100644
--- a/vpx_dsp/mips/idct4x4_msa.c
+++ b/vpx_dsp/mips/idct4x4_msa.c
@@ -42,8 +42,8 @@ void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
in0_r -= in3_r;
in2_r += in1_r;
- PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r,
- in0, in1, in2, in3);
+ PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
+ in2, in3);
ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
}
diff --git a/vpx_dsp/mips/idct8x8_msa.c b/vpx_dsp/mips/idct8x8_msa.c
index fd667e456..7f77d2019 100644
--- a/vpx_dsp/mips/idct8x8_msa.c
+++ b/vpx_dsp/mips/idct8x8_msa.c
@@ -18,17 +18,17 @@ void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
/* rows transform */
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
/* 1D idct8x8 */
- VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
/* columns transform */
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
/* 1D idct8x8 */
- VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
/* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H4_SH(in0, in1, in2, in3, 5);
SRARI_H4_SH(in4, in5, in6, in7, 5);
@@ -82,12 +82,12 @@ void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
/* stage4 */
- BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
+ in7);
+ TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
/* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H4_SH(in0, in1, in2, in3, 5);
diff --git a/vpx_dsp/mips/intrapred16_dspr2.c b/vpx_dsp/mips/intrapred16_dspr2.c
index 11444c718..3e29d0ac3 100644
--- a/vpx_dsp/mips/intrapred16_dspr2.c
+++ b/vpx_dsp/mips/intrapred16_dspr2.c
@@ -13,10 +13,10 @@
#if HAVE_DSPR2
void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
"lb %[tmp2], 1(%[left]) \n\t"
"lb %[tmp3], 2(%[left]) \n\t"
@@ -146,26 +146,23 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[tmp16], 8(%[dst]) \n\t"
"sw %[tmp16], 12(%[dst]) \n\t"
- : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
- [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
- [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
- [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8),
- [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10),
- [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12),
- [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14),
- [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16)
- : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
- );
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
+ [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
+ [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
+ [tmp16] "=&r"(tmp16)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
}
void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
- int32_t above2, left2;
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, left2;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[above1], (%[above]) \n\t"
"lw %[above2], 4(%[above]) \n\t"
"lw %[left1], (%[left]) \n\t"
@@ -316,14 +313,12 @@ void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[expected_dc], 8(%[dst]) \n\t"
"sw %[expected_dc], 12(%[dst]) \n\t"
- : [left1] "=&r" (left1), [above1] "=&r" (above1),
- [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1),
- [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1),
- [above2] "=&r" (above2), [left2] "=&r" (left2),
- [average] "=&r" (average), [tmp] "=&r" (tmp),
- [expected_dc] "=&r" (expected_dc)
- : [above] "r" (above), [left] "r" (left),
- [dst] "r" (dst), [stride] "r" (stride)
- );
+ : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
+ [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
+ [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
+ [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
}
#endif // #if HAVE_DSPR2
diff --git a/vpx_dsp/mips/intrapred4_dspr2.c b/vpx_dsp/mips/intrapred4_dspr2.c
index 03baf4c9c..9f51d50c7 100644
--- a/vpx_dsp/mips/intrapred4_dspr2.c
+++ b/vpx_dsp/mips/intrapred4_dspr2.c
@@ -13,9 +13,9 @@
#if HAVE_DSPR2
void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4;
+ int32_t tmp1, tmp2, tmp3, tmp4;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
"lb %[tmp2], 1(%[left]) \n\t"
"lb %[tmp3], 2(%[left]) \n\t"
@@ -32,19 +32,18 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"add %[dst], %[dst], %[stride] \n\t"
"sw %[tmp4], (%[dst]) \n\t"
- : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
- [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
- : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
- );
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
}
void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[above_c], (%[above]) \n\t"
"lw %[left_c], (%[left]) \n\t"
@@ -70,27 +69,26 @@ void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"add %[dst], %[dst], %[stride] \n\t"
"sw %[expected_dc], (%[dst]) \n\t"
- : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l),
- [above_r] "=&r" (above_r), [left_c] "=&r" (left_c),
- [left_l] "=&r" (left_l), [left_r] "=&r" (left_r),
- [average] "=&r" (average), [tmp] "=&r" (tmp),
- [expected_dc] "=&r" (expected_dc)
- : [above] "r" (above), [left] "r" (left),
- [dst] "r" (dst), [stride] "r" (stride)
- );
+ : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
+ [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
+ [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
}
void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t abovel, abover;
- int32_t left0, left1, left2, left3;
- int32_t res0, res1;
- int32_t resl;
- int32_t resr;
- int32_t top_left;
- uint8_t *cm = vpx_ff_cropTbl;
-
- __asm__ __volatile__ (
+ int32_t abovel, abover;
+ int32_t left0, left1, left2, left3;
+ int32_t res0, res1;
+ int32_t resl;
+ int32_t resr;
+ int32_t top_left;
+ uint8_t *cm = vpx_ff_cropTbl;
+
+ __asm__ __volatile__(
"ulw %[resl], (%[above]) \n\t"
"lbu %[left0], (%[left]) \n\t"
@@ -174,7 +172,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
-
"sra %[res1], %[resr], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], (%[dst]) \n\t"
@@ -183,7 +180,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
-
"sb %[res1], 1(%[dst]) \n\t"
"sra %[res1], %[resl], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
@@ -218,12 +214,11 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"sb %[res0], 2(%[dst]) \n\t"
"sb %[res1], 3(%[dst]) \n\t"
- : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
- [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2),
- [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3),
- [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left)
- : [above] "r" (above), [left] "r" (left),
- [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
- );
+ : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
+ [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
+ [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
+ [resr] "=&r"(resr), [top_left] "=&r"(top_left)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride), [cm] "r"(cm));
}
#endif // #if HAVE_DSPR2
diff --git a/vpx_dsp/mips/intrapred8_dspr2.c b/vpx_dsp/mips/intrapred8_dspr2.c
index 196ff5a06..eac79d510 100644
--- a/vpx_dsp/mips/intrapred8_dspr2.c
+++ b/vpx_dsp/mips/intrapred8_dspr2.c
@@ -13,9 +13,9 @@
#if HAVE_DSPR2
void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
"lb %[tmp2], 1(%[left]) \n\t"
"lb %[tmp3], 2(%[left]) \n\t"
@@ -58,23 +58,20 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[tmp8], (%[dst]) \n\t"
"sw %[tmp8], 4(%[dst]) \n\t"
- : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
- [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
- [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
- [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8)
- : [left] "r" (left), [dst] "r" (dst),
- [stride] "r" (stride)
- );
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
}
void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
- int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[above1], (%[above]) \n\t"
"lw %[above2], 4(%[above]) \n\t"
"lw %[left1], (%[left]) \n\t"
@@ -137,30 +134,29 @@ void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[expected_dc], (%[dst]) \n\t"
"sw %[expected_dc], 4(%[dst]) \n\t"
- : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1),
- [above_r1] "=&r" (above_r1), [left1] "=&r" (left1),
- [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1),
- [above2] "=&r" (above2), [above_l2] "=&r" (above_l2),
- [above_r2] "=&r" (above_r2), [left2] "=&r" (left2),
- [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2),
- [average] "=&r" (average), [tmp] "=&r" (tmp),
- [expected_dc] "=&r" (expected_dc)
- : [above] "r" (above), [left] "r" (left), [dst] "r" (dst),
- [stride] "r" (stride)
- );
+ : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
+ [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
+ [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
+ [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
+ [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
+ [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
}
void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int32_t abovel, abover;
- int32_t abovel_1, abover_1;
- int32_t left0;
- int32_t res0, res1, res2, res3;
- int32_t reshw;
- int32_t top_left;
- uint8_t *cm = vpx_ff_cropTbl;
-
- __asm__ __volatile__ (
+ int32_t abovel, abover;
+ int32_t abovel_1, abover_1;
+ int32_t left0;
+ int32_t res0, res1, res2, res3;
+ int32_t reshw;
+ int32_t top_left;
+ uint8_t *cm = vpx_ff_cropTbl;
+
+ __asm__ __volatile__(
"ulw %[reshw], (%[above]) \n\t"
"ulw %[top_left], 4(%[above]) \n\t"
@@ -595,13 +591,12 @@ void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
"sb %[res2], 6(%[dst]) \n\t"
"sb %[res3], 7(%[dst]) \n\t"
- : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
- [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1),
- [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3),
- [res0] "=&r" (res0), [res1] "=&r" (res1),
- [reshw] "=&r" (reshw), [top_left] "=&r" (top_left)
- : [above] "r" (above), [left] "r" (left),
- [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
- );
+ : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
+ [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
+ [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
+ [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
+ [top_left] "=&r"(top_left)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride), [cm] "r"(cm));
}
#endif // #if HAVE_DSPR2
diff --git a/vpx_dsp/mips/intrapred_msa.c b/vpx_dsp/mips/intrapred_msa.c
index f6fbe4016..b5ee94303 100644
--- a/vpx_dsp/mips/intrapred_msa.c
+++ b/vpx_dsp/mips/intrapred_msa.c
@@ -11,10 +11,11 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
-#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \
- out0 = __msa_subs_u_h(out0, in0); \
- out1 = __msa_subs_u_h(out1, in1); \
-}
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
+ { \
+ out0 = __msa_subs_u_h(out0, in0); \
+ out1 = __msa_subs_u_h(out1, in1); \
+ }
static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
int32_t dst_stride) {
@@ -150,8 +151,8 @@ static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
}
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint32_t val0, val1;
v16i8 store, src = { 0 };
v8u16 sum_h;
@@ -199,8 +200,8 @@ static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
}
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint64_t val0, val1;
v16i8 store;
v16u8 src = { 0 };
@@ -260,8 +261,8 @@ static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
}
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
v16u8 top, left, out;
v8u16 sum_h, sum_top, sum_left;
v4u32 sum_w;
@@ -313,8 +314,8 @@ static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
}
static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint32_t row;
v16u8 top0, top1, left0, left1, out;
v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
@@ -381,8 +382,8 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
}
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint32_t val;
uint8_t top_left = src_top_ptr[-1];
v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
@@ -409,8 +410,8 @@ static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
}
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint64_t val;
uint8_t top_left = src_top_ptr[-1];
uint32_t loop_cnt;
@@ -442,8 +443,8 @@ static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
}
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint8_t top_left = src_top_ptr[-1];
uint32_t loop_cnt;
v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
@@ -491,8 +492,8 @@ static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
}
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
- const uint8_t *src_left,
- uint8_t *dst, int32_t dst_stride) {
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
uint8_t top_left = src_top[-1];
uint32_t loop_cnt;
v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
diff --git a/vpx_dsp/mips/inv_txfm_dspr2.h b/vpx_dsp/mips/inv_txfm_dspr2.h
index abd850911..edd54aec5 100644
--- a/vpx_dsp/mips/inv_txfm_dspr2.h
+++ b/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -23,31 +23,39 @@ extern "C" {
#endif
#if HAVE_DSPR2
-#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \
+ ({ \
\
- int32_t tmp, out; \
- int dct_cost_rounding = DCT_CONST_ROUNDING; \
- int in = input; \
+ int32_t tmp, out; \
+ int dct_cost_rounding = DCT_CONST_ROUNDING; \
+ int in = input; \
\
- __asm__ __volatile__ ( \
- /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \
- "mtlo %[dct_cost_rounding], $ac1 \n\t"\
- "mthi $zero, $ac1 \n\t"\
- "madd $ac1, %[in], %[cospi_16_64] \n\t"\
- "extp %[tmp], $ac1, 31 \n\t"\
+ __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \
+ "mtlo %[dct_cost_rounding], $ac1 " \
+ " \n\t" \
+ "mthi $zero, $ac1 " \
+ " \n\t" \
+ "madd $ac1, %[in], " \
+ "%[cospi_16_64] \n\t" \
+ "extp %[tmp], $ac1, " \
+ "31 \n\t" \
\
- /* out = dct_const_round_shift(out * cospi_16_64); */ \
- "mtlo %[dct_cost_rounding], $ac2 \n\t"\
- "mthi $zero, $ac2 \n\t"\
- "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\
- "extp %[out], $ac2, 31 \n\t"\
+ /* out = dct_const_round_shift(out * cospi_16_64); */ \
+ "mtlo %[dct_cost_rounding], $ac2 " \
+ " \n\t" \
+ "mthi $zero, $ac2 " \
+ " \n\t" \
+ "madd $ac2, %[tmp], " \
+ "%[cospi_16_64] \n\t" \
+ "extp %[out], $ac2, " \
+ "31 \n\t" \
\
- : [tmp] "=&r" (tmp), [out] "=r" (out) \
- : [in] "r" (in), \
- [dct_cost_rounding] "r" (dct_cost_rounding), \
- [cospi_16_64] "r" (cospi_16_64) \
- ); \
- out; })
+ : [tmp] "=&r"(tmp), [out] "=r"(out) \
+ : [in] "r"(in), \
+ [dct_cost_rounding] "r"(dct_cost_rounding), \
+ [cospi_16_64] "r"(cospi_16_64)); \
+ out; \
+ })
void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
int dest_stride);
@@ -59,10 +67,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
int dest_stride);
void iadst8_dspr2(const int16_t *input, int16_t *output);
-void idct16_rows_dspr2(const int16_t *input, int16_t *output,
- uint32_t no_rows);
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
void iadst16_dspr2(const int16_t *input, int16_t *output);
#endif // #if HAVE_DSPR2
diff --git a/vpx_dsp/mips/inv_txfm_msa.h b/vpx_dsp/mips/inv_txfm_msa.h
index 1458561a6..8b8a4173d 100644
--- a/vpx_dsp/mips/inv_txfm_msa.h
+++ b/vpx_dsp/mips/inv_txfm_msa.h
@@ -15,391 +15,392 @@
#include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h"
-#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
- v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
- cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
- v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \
- -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \
- \
- SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
- cnst2_m = -cnst0_m; \
- ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
- SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
- cnst4_m = -cnst2_m; \
- ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
- \
- ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
- ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
- DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
- cnst1_m, cnst2_m, cnst3_m, in7, in0, \
- in4, in3); \
- \
- SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
- cnst2_m = -cnst0_m; \
- ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
- SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
- cnst4_m = -cnst2_m; \
- ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
- \
- ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
- ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
- \
- DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
- cnst1_m, cnst2_m, cnst3_m, in5, in2, \
- in6, in1); \
- BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
- out7 = -s0_m; \
- out0 = s1_m; \
- \
- SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \
- cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
- \
- ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
- cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- cnst1_m = cnst0_m; \
- \
- ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
- ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
- DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
- cnst2_m, cnst3_m, cnst1_m, out1, out6, \
- s0_m, s1_m); \
- \
- SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- \
- ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
- ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
- \
- out1 = -out1; \
- out3 = -out3; \
- out5 = -out5; \
-}
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
+ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
+ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
+ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
+ cospi_24_64, -cospi_24_64, 0, 0 }; \
+ \
+ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in7, in0, in4, in3); \
+ \
+ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in5, in2, in6, in1); \
+ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
+ out7 = -s0_m; \
+ out0 = s1_m; \
+ \
+ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
+ \
+ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ cnst1_m = cnst0_m; \
+ \
+ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
+ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
+ \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ out1 = -out1; \
+ out3 = -out3; \
+ out5 = -out5; \
+ }
-#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \
- v8i16 out0_m, r0_m, r1_m; \
- \
- r0_m = __msa_fill_h(c0_h); \
- r1_m = __msa_fill_h(c1_h); \
- out0_m = __msa_ilvev_h(r1_m, r0_m); \
- \
- out0_m; \
-})
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h) \
+ ({ \
+ v8i16 out0_m, r0_m, r1_m; \
+ \
+ r0_m = __msa_fill_h(c0_h); \
+ r1_m = __msa_fill_h(c1_h); \
+ out0_m = __msa_ilvev_h(r1_m, r0_m); \
+ \
+ out0_m; \
+ })
-#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \
- uint8_t *dst_m = (uint8_t *) (dst); \
- v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
- v16i8 tmp0_m, tmp1_m; \
- v16i8 zero_m = { 0 }; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
- ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \
- zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \
- ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \
- res0_m, res1_m, res2_m, res3_m); \
- CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
- PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
-}
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
+ { \
+ uint8_t *dst_m = (uint8_t *)(dst); \
+ v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
+ ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
+ res0_m, res1_m, res2_m, res3_m); \
+ ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
+ res2_m, res3_m); \
+ CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
+ PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
+ }
-#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 c0_m, c1_m, c2_m, c3_m; \
- v8i16 step0_m, step1_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
- c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
- step0_m = __msa_ilvr_h(in2, in0); \
- DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
- \
- c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
- c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
- step1_m = __msa_ilvr_h(in3, in1); \
- DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- \
- PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
- SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
- BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \
- (v8i16)tmp2_m, (v8i16)tmp3_m, \
- out0, out1, out2, out3); \
-}
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 c0_m, c1_m, c2_m, c3_m; \
+ v8i16 step0_m, step1_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ step0_m = __msa_ilvr_h(in2, in0); \
+ DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ \
+ c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ step1_m = __msa_ilvr_h(in3, in1); \
+ DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ \
+ PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
+ SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
+ BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
+ out0, out1, out2, out3); \
+ }
-#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 res0_m, res1_m, c0_m, c1_m; \
- v8i16 k1_m, k2_m, k3_m, k4_m; \
- v8i16 zero_m = { 0 }; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v4i32 int0_m, int1_m, int2_m, int3_m; \
- v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \
- sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \
- -sinpi_4_9 }; \
- \
- SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
- ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
- ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
- DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
- int0_m = tmp2_m + tmp1_m; \
- \
- SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
- ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
- DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
- int1_m = tmp0_m + tmp1_m; \
- \
- c0_m = __msa_splati_h(mask_m, 6); \
- ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
- ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
- DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
- int2_m = tmp0_m + tmp1_m; \
- \
- c0_m = __msa_splati_h(mask_m, 6); \
- c0_m = __msa_ilvev_h(c0_m, k1_m); \
- \
- res0_m = __msa_ilvr_h((in1), (in3)); \
- tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
- int3_m = tmp2_m + tmp0_m; \
- \
- res0_m = __msa_ilvr_h((in2), (in3)); \
- c1_m = __msa_ilvev_h(k4_m, k3_m); \
- \
- tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
- res1_m = __msa_ilvr_h((in0), (in2)); \
- c1_m = __msa_ilvev_h(k1_m, zero_m); \
- \
- tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
- int3_m += tmp2_m; \
- int3_m += tmp3_m; \
- \
- SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
- PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
-}
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 res0_m, res1_m, c0_m, c1_m; \
+ v8i16 k1_m, k2_m, k3_m, k4_m; \
+ v8i16 zero_m = { 0 }; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v4i32 int0_m, int1_m, int2_m, int3_m; \
+ v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
+ -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
+ \
+ SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
+ ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
+ ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
+ int0_m = tmp2_m + tmp1_m; \
+ \
+ SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
+ ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ int1_m = tmp0_m + tmp1_m; \
+ \
+ c0_m = __msa_splati_h(mask_m, 6); \
+ ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
+ ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ int2_m = tmp0_m + tmp1_m; \
+ \
+ c0_m = __msa_splati_h(mask_m, 6); \
+ c0_m = __msa_ilvev_h(c0_m, k1_m); \
+ \
+ res0_m = __msa_ilvr_h((in1), (in3)); \
+ tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
+ int3_m = tmp2_m + tmp0_m; \
+ \
+ res0_m = __msa_ilvr_h((in2), (in3)); \
+ c1_m = __msa_ilvev_h(k4_m, k3_m); \
+ \
+ tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
+ res1_m = __msa_ilvr_h((in0), (in2)); \
+ c1_m = __msa_ilvev_h(k1_m, zero_m); \
+ \
+ tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
+ int3_m += tmp2_m; \
+ int3_m += tmp3_m; \
+ \
+ SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
+ PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
+ }
-#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \
- v8i16 c0_m, c1_m; \
- \
- SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
- c0_m = __msa_ilvev_h(c1_m, c0_m); \
- \
- c0_m; \
-})
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
+ ({ \
+ v8i16 c0_m, c1_m; \
+ \
+ SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
+ c0_m = __msa_ilvev_h(c1_m, c0_m); \
+ \
+ c0_m; \
+ })
/* multiply and add macro */
-#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
- out0, out1, out2, out3) { \
- v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
- ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
- DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \
- cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
- DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \
- cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
-}
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
+ out2, out3) \
+ { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
+ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
+ cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
+ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
+ cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
+ }
/* idct 8x8 macro */
-#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
- v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
- cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
- \
- k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
- k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
- k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
- k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
- VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
- SUB2(in1, in3, in7, in5, res0_m, res1_m); \
- k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
- k1_m = __msa_splati_h(mask_m, 4); \
- \
- ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
- DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
- tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- tp4_m = in1 + in3; \
- PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
- tp7_m = in7 + in5; \
- k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
- k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
- VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \
- in0, in4, in2, in6); \
- BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
- BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \
- out0, out1, out2, out3, out4, out5, out6, out7); \
-}
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
+ cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
+ \
+ k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
+ k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
+ k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
+ k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
+ VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
+ SUB2(in1, in3, in7, in5, res0_m, res1_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
+ k1_m = __msa_splati_h(mask_m, 4); \
+ \
+ ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
+ DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ tp4_m = in1 + in3; \
+ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
+ tp7_m = in7 + in5; \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
+ BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
+ BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
+ out1, out2, out3, out4, out5, out6, out7); \
+ }
-#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
- v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
- v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \
- cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
- v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \
- cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
- v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \
- -cospi_16_64, 0, 0, 0, 0 }; \
- \
- k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
- k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
- ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
- r0_m, r1_m, r2_m, r3_m); \
- k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
- k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
- ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
- r4_m, r5_m, r6_m, r7_m); \
- ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
- SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
- k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
- k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
- ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
- r0_m, r1_m, r2_m, r3_m); \
- k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
- k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
- ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
- r4_m, r5_m, r6_m, r7_m); \
- ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
- SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
- ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
- BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
- k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
- k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
- ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
- r0_m, r1_m, r2_m, r3_m); \
- k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
- DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
- r4_m, r5_m, r6_m, r7_m); \
- ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
- SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
- k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
- k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
- ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
- ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
- DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
- m0_m, m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
- \
- out1 = -in1; \
- out3 = -in3; \
- out5 = -in5; \
- out7 = -in7; \
-}
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
+ v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
+ v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
+ cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
+ v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
+ -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
+ v8i16 mask3_m = { \
+ -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
+ }; \
+ \
+ k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
+ k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
+ ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
+ k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
+ ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
+ r5_m, r6_m, r7_m); \
+ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
+ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
+ k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
+ ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
+ k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
+ ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
+ r5_m, r6_m, r7_m); \
+ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
+ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
+ ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
+ BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
+ k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
+ k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
+ ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
+ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
+ r6_m, r7_m); \
+ ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
+ SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
+ k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
+ k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
+ ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
+ m1_m, m2_m, m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
+ ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
+ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
+ m2_m, m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
+ \
+ out1 = -in1; \
+ out3 = -in3; \
+ out5 = -in5; \
+ out7 = -in7; \
+ }
-#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \
- r9, r10, r11, r12, r13, r14, r15, \
- out0, out1, out2, out3, out4, out5, \
- out6, out7, out8, out9, out10, out11, \
- out12, out13, out14, out15) { \
- v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
- v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
- v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
- v8i16 h8_m, h9_m, h10_m, h11_m; \
- v8i16 k0_m, k1_m, k2_m, k3_m; \
- \
- /* stage 1 */ \
- k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
- k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
- k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
- k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
- MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
- g0_m, g1_m, g2_m, g3_m); \
- k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
- k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
- k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
- k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
- MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
- g4_m, g5_m, g6_m, g7_m); \
- k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
- k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
- k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
- k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
- MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
- g8_m, g9_m, g10_m, g11_m); \
- k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
- k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
- k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
- k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
- MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
- g12_m, g13_m, g14_m, g15_m); \
- \
- /* stage 2 */ \
- k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
- k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
- k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
- MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
- h0_m, h1_m, h2_m, h3_m); \
- k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
- k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
- k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
- MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
- h4_m, h5_m, h6_m, h7_m); \
- BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
- BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
- h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
- \
- /* stage 3 */ \
- BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
- k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
- k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
- k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
- MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
- out4, out6, out5, out7); \
- MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
- out12, out14, out13, out15); \
- \
- /* stage 4 */ \
- k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
- k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
- k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
- k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
- MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
- MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
- MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
- MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
-}
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
+ r12, r13, r14, r15, out0, out1, out2, out3, out4, \
+ out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
+ v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
+ v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
+ v8i16 h8_m, h9_m, h10_m, h11_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m; \
+ \
+ /* stage 1 */ \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
+ MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
+ MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
+ MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
+ g11_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
+ MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
+ g15_m); \
+ \
+ /* stage 2 */ \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
+ k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
+ MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
+ h3_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
+ k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
+ MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
+ h6_m, h7_m); \
+ BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
+ BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
+ h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
+ \
+ /* stage 3 */ \
+ BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
+ MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
+ out7); \
+ MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
+ out13, out15); \
+ \
+ /* stage 4 */ \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
+ MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
+ MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
+ MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
+ MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
+ }
void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
int32_t dst_stride);
diff --git a/vpx_dsp/mips/itrans16_dspr2.c b/vpx_dsp/mips/itrans16_dspr2.c
index 6d41e6190..0ec0c2059 100644
--- a/vpx_dsp/mips/itrans16_dspr2.c
+++ b/vpx_dsp/mips/itrans16_dspr2.c
@@ -26,11 +26,11 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
int result1, result2, result3, result4;
const int const_2_power_13 = 8192;
- for (i = no_rows; i--; ) {
+ for (i = no_rows; i--;) {
/* prefetch row */
prefetch_load((const uint8_t *)(input + 16));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 16(%[input]) \n\t"
"lh %[load3], 8(%[input]) \n\t"
@@ -64,19 +64,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[step1_2], %[step2_1], %[step2_2] \n\t"
"sub %[step1_3], %[step2_0], %[step2_3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
- [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
- [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
- [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
+ [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
+ [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
"lh %[load5], 2(%[input]) \n\t"
"lh %[load6], 30(%[input]) \n\t"
"lh %[load7], 18(%[input]) \n\t"
@@ -126,19 +125,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_8], %[result1], %[result2] \n\t"
"add %[step2_15], %[result4], %[result3] \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6),
- [load7] "=&r" (load7), [load8] "=&r" (load8),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [result3] "=&r" (result3), [result4] "=&r" (result4),
- [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
- [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
- [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
-
- __asm__ __volatile__ (
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
+ [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
+ [step2_14] "=r"(step2_14)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 22(%[input]) \n\t"
"lh %[load3], 26(%[input]) \n\t"
@@ -188,19 +186,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_11], %[result1], %[result2] \n\t"
"add %[step2_12], %[result4], %[result3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [result3] "=&r" (result3), [result4] "=&r" (result4),
- [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
- [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
- [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
"lh %[load5], 4(%[input]) \n\t"
"lh %[load6], 28(%[input]) \n\t"
"lh %[load7], 20(%[input]) \n\t"
@@ -253,19 +250,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_4], %[result1], %[result2] \n\t"
"add %[step1_7], %[result4], %[result3] \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6),
- [load7] "=&r" (load7), [load8] "=&r" (load8),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [result3] "=&r" (result3), [result4] "=&r" (result4),
- [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
- [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
-
- __asm__ __volatile__ (
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"mtlo %[const_2_power_13], $ac1 \n\t"
@@ -305,18 +301,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"extp %[step1_11], $ac2, 31 \n\t"
"extp %[step1_12], $ac3, 31 \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6),
- [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
- [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
- [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
- [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
- [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
- [cospi_16_64] "r" (cospi_16_64)
- );
-
- __asm__ __volatile__ (
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
+ [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
+ [step1_13] "=r"(step1_13)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
+ [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
+ [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
+ [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
"add %[load5], %[step1_0], %[step1_7] \n\t"
"add %[load5], %[load5], %[step2_12] \n\t"
"add %[load5], %[load5], %[step2_15] \n\t"
@@ -350,17 +344,15 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"sh %[load5], 448(%[output]) \n\t"
"sh %[load6], 480(%[output]) \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6)
- : [output] "r" (output),
- [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
- [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
- [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
- [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
- [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
- [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
- );
-
- __asm__ __volatile__ (
+ : [load5] "=&r"(load5), [load6] "=&r"(load6)
+ : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
+ [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
+ [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
+ [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
+ [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
+
+ __asm__ __volatile__(
"add %[load5], %[step1_2], %[step1_5] \n\t"
"add %[load5], %[load5], %[step1_13] \n\t"
"add %[load6], %[step1_3], %[step1_4] \n\t"
@@ -386,21 +378,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"sh %[load5], 384(%[output]) \n\t"
"sh %[load6], 416(%[output]) \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6)
- : [output] "r" (output),
- [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
- [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
- [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
- [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
- );
+ : [load5] "=&r"(load5), [load6] "=&r"(load6)
+ : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
+ [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
+ [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
input += 16;
output += 1;
}
}
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
int i;
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
int step1_8, step1_9, step1_10, step1_11;
@@ -416,9 +405,9 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
/* prefetch vpx_ff_cropTbl */
prefetch_load(vpx_ff_cropTbl);
- prefetch_load(vpx_ff_cropTbl + 32);
- prefetch_load(vpx_ff_cropTbl + 64);
- prefetch_load(vpx_ff_cropTbl + 96);
+ prefetch_load(vpx_ff_cropTbl + 32);
+ prefetch_load(vpx_ff_cropTbl + 64);
+ prefetch_load(vpx_ff_cropTbl + 96);
prefetch_load(vpx_ff_cropTbl + 128);
prefetch_load(vpx_ff_cropTbl + 160);
prefetch_load(vpx_ff_cropTbl + 192);
@@ -426,7 +415,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
for (i = 0; i < 16; ++i) {
dest_pix = (dest + i);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 16(%[input]) \n\t"
"lh %[load3], 8(%[input]) \n\t"
@@ -460,19 +449,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sub %[step1_2], %[step2_1], %[step2_2] \n\t"
"sub %[step1_3], %[step2_0], %[step2_3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
- [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
- [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
- [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
+ [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
+ [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
"lh %[load5], 2(%[input]) \n\t"
"lh %[load6], 30(%[input]) \n\t"
"lh %[load7], 18(%[input]) \n\t"
@@ -522,19 +510,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_8], %[result1], %[result2] \n\t"
"add %[step2_15], %[result4], %[result3] \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6),
- [load7] "=&r" (load7), [load8] "=&r" (load8),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [result3] "=&r" (result3), [result4] "=&r" (result4),
- [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
- [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
- [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
-
- __asm__ __volatile__ (
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
+ [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
+ [step2_14] "=r"(step2_14)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 22(%[input]) \n\t"
"lh %[load3], 26(%[input]) \n\t"
@@ -584,19 +571,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_11], %[result1], %[result2] \n\t"
"add %[step2_12], %[result4], %[result3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [result3] "=&r" (result3), [result4] "=&r" (result4),
- [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
- [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
- [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
"lh %[load5], 4(%[input]) \n\t"
"lh %[load6], 28(%[input]) \n\t"
"lh %[load7], 20(%[input]) \n\t"
@@ -650,19 +636,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_4], %[result1], %[result2] \n\t"
"add %[step1_7], %[result4], %[result3] \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6),
- [load7] "=&r" (load7), [load8] "=&r" (load8),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [result3] "=&r" (result3), [result4] "=&r" (result4),
- [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
- [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
-
- __asm__ __volatile__ (
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"mtlo %[const_2_power_13], $ac1 \n\t"
@@ -702,23 +687,21 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"extp %[step1_11], $ac2, 31 \n\t"
"extp %[step1_12], $ac3, 31 \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6),
- [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
- [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
- [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
- [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
- [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
+ [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
+ [step1_13] "=r"(step1_13)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
+ [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
+ [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
+ [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
step1_8 = step2_8 + step2_11;
step1_9 = step2_9 + step2_10;
step1_14 = step2_13 + step2_14;
step1_15 = step2_12 + step2_15;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_0], %[step1_7] \n\t"
"add %[load5], %[load5], %[step1_15] \n\t"
@@ -870,18 +853,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
- [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
- [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
- [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
- [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
- [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
- [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
- [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
- [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
- );
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
+ :
+ [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
+ [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
+ [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
+ [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
+ [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
input += 16;
}
@@ -889,15 +870,11 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
uint32_t pos = 45;
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
// First transform rows
idct16_rows_dspr2(input, out, 16);
@@ -908,17 +885,13 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
int16_t *outptr = out;
uint32_t i;
uint32_t pos = 45;
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -926,7 +899,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
outptr += 4;
for (i = 0; i < 6; ++i) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 32(%[outptr]) \n\t"
"sw $zero, 64(%[outptr]) \n\t"
@@ -945,8 +918,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 480(%[outptr]) \n\t"
:
- : [outptr] "r" (outptr)
- );
+ : [outptr] "r"(outptr));
outptr += 2;
}
@@ -966,35 +938,31 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
int32_t vector_1, vector_2, vector_3, vector_4;
  /* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ :
+ : [pos] "r"(pos));
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"addi %[out], %[out], 32 \n\t"
"sra %[a1], %[out], 6 \n\t"
- : [out] "+r" (out), [a1] "=r" (a1)
- :
- );
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
if (a1 < 0) {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t"
- : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 16; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t"
@@ -1009,25 +977,22 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_4], 12(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t"
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
- [dest] "+&r" (dest)
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
- );
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
- "replv.qb %[vector_a1], %[a1] \n\t"
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
- : [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 16; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t"
@@ -1042,12 +1007,11 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_4], 12(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t"
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
- [dest] "+&r" (dest)
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
- );
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
}
}
}
@@ -1072,21 +1036,20 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
int x14 = input[1];
int x15 = input[14];
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
- | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4]
- = output[5] = output[6] = output[7] = output[8]
- = output[9] = output[10] = output[11] = output[12]
- = output[13] = output[14] = output[15] = 0;
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = output[8] = output[9] = output[10] =
+ output[11] = output[12] = output[13] = output[14] = output[15] = 0;
return;
}
// stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
@@ -1095,9 +1058,9 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
x0 = dct_const_round_shift(s0 + s8);
x1 = dct_const_round_shift(s1 + s9);
@@ -1107,8 +1070,8 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
x5 = dct_const_round_shift(s5 + s13);
x6 = dct_const_round_shift(s6 + s14);
x7 = dct_const_round_shift(s7 + s15);
- x8 = dct_const_round_shift(s0 - s8);
- x9 = dct_const_round_shift(s1 - s9);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
x10 = dct_const_round_shift(s2 - s10);
x11 = dct_const_round_shift(s3 - s11);
x12 = dct_const_round_shift(s4 - s12);
@@ -1125,14 +1088,14 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
s5 = x5;
s6 = x6;
s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
x0 = s0 + s4;
x1 = s1 + s5;
@@ -1156,18 +1119,18 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
s8 = x8;
s9 = x9;
s10 = x10;
s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = s0 + s2;
x1 = s1 + s3;
@@ -1187,13 +1150,13 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
x15 = dct_const_round_shift(s13 - s15);
// stage 4
- s2 = (- cospi_16_64) * (x2 + x3);
+ s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (- x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (- x10 + x11);
- s14 = (- cospi_16_64) * (x14 + x15);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
x2 = dct_const_round_shift(s2);
@@ -1205,23 +1168,22 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
x14 = dct_const_round_shift(s14);
x15 = dct_const_round_shift(s15);
- output[0] = x0;
+ output[0] = x0;
output[1] = -x8;
- output[2] = x12;
+ output[2] = x12;
output[3] = -x4;
- output[4] = x6;
- output[5] = x14;
- output[6] = x10;
- output[7] = x2;
- output[8] = x3;
- output[9] = x11;
- output[10] = x15;
- output[11] = x7;
- output[12] = x5;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
output[13] = -x13;
- output[14] = x9;
+ output[14] = x9;
output[15] = -x1;
}
-
#endif // HAVE_DSPR2
diff --git a/vpx_dsp/mips/itrans32_cols_dspr2.c b/vpx_dsp/mips/itrans32_cols_dspr2.c
index 553acb0f5..ce25d55c9 100644
--- a/vpx_dsp/mips/itrans32_cols_dspr2.c
+++ b/vpx_dsp/mips/itrans32_cols_dspr2.c
@@ -39,9 +39,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
/* prefetch vpx_ff_cropTbl */
prefetch_load(vpx_ff_cropTbl);
- prefetch_load(vpx_ff_cropTbl + 32);
- prefetch_load(vpx_ff_cropTbl + 64);
- prefetch_load(vpx_ff_cropTbl + 96);
+ prefetch_load(vpx_ff_cropTbl + 32);
+ prefetch_load(vpx_ff_cropTbl + 64);
+ prefetch_load(vpx_ff_cropTbl + 96);
prefetch_load(vpx_ff_cropTbl + 128);
prefetch_load(vpx_ff_cropTbl + 160);
prefetch_load(vpx_ff_cropTbl + 192);
@@ -51,7 +51,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
dest_pix = dest + i;
dest_pix1 = dest + i + 31 * dest_stride;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 2(%[input]) \n\t"
"lh %[load2], 62(%[input]) \n\t"
"lh %[load3], 34(%[input]) \n\t"
@@ -101,18 +101,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_16], %[temp0], %[temp1] \n\t"
"add %[step1_31], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
- [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
- [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
- [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
+ [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
+ [step1_31] "=r"(step1_31)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
+ [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
"lh %[load1], 18(%[input]) \n\t"
"lh %[load2], 46(%[input]) \n\t"
"lh %[load3], 50(%[input]) \n\t"
@@ -162,18 +161,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_19], %[temp0], %[temp1] \n\t"
"add %[step1_28], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
- [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
- [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
- [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
+ [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
+ [step1_29] "=r"(step1_29)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
+ [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 54(%[input]) \n\t"
"lh %[load3], 42(%[input]) \n\t"
@@ -223,18 +221,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_20], %[temp0], %[temp1] \n\t"
"add %[step1_27], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
- [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
- [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
- [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
- [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
+ [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
+ [step1_27] "=r"(step1_27)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
+ [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
"lh %[load1], 26(%[input]) \n\t"
"lh %[load2], 38(%[input]) \n\t"
"lh %[load3], 58(%[input]) \n\t"
@@ -280,18 +277,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_23], %[temp0], %[temp1] \n\t"
"add %[step1_24], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
- [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
- [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
- [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
- [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
+ [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
+ [step1_25] "=r"(step1_25)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
+ [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
"lh %[load1], 4(%[input]) \n\t"
"lh %[load2], 60(%[input]) \n\t"
"lh %[load3], 36(%[input]) \n\t"
@@ -337,18 +333,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_8], %[temp0], %[temp1] \n\t"
"add %[step2_15], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
- [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
- [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
- [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
- [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
+ [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
+ [step2_15] "=r"(step2_15)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
"lh %[load1], 20(%[input]) \n\t"
"lh %[load2], 44(%[input]) \n\t"
"lh %[load3], 52(%[input]) \n\t"
@@ -394,18 +389,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_11], %[temp0], %[temp1] \n\t"
"add %[step2_12], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
- [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
- [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
- [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
- [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"sub %[temp0], %[step2_14], %[step2_13] \n\t"
@@ -440,33 +434,31 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"extp %[step3_11], $ac2, 31 \n\t"
"extp %[step3_12], $ac3, 31 \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
- [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
- [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
- [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
- : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
- [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
- [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
- [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
- [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
+ [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
+ [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
+ [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
+ [step3_15] "=r"(step3_15)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
+ [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
+ [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
+ [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
+ [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
step2_18 = step1_17 - step1_18;
step2_29 = step1_30 - step1_29;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
"extp %[step3_18], $ac0, 31 \n\t"
- : [step3_18] "=r" (step3_18)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_18] "=r"(step3_18)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
+ [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -474,18 +466,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_19 = step1_16 - step1_19;
step2_28 = step1_31 - step1_28;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
"extp %[step3_19], $ac0, 31 \n\t"
- : [step3_19] "=r" (step3_19)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_19] "=r"(step3_19)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
+ [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -498,18 +489,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_20 = step1_23 - step1_20;
step2_27 = step1_24 - step1_27;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
"msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
"extp %[step3_20], $ac0, 31 \n\t"
- : [step3_20] "=r" (step3_20)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_20] "=r"(step3_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -517,18 +507,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_21 = step1_22 - step1_21;
step2_26 = step1_25 - step1_26;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t"
"msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
"msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
"extp %[step3_21], $ac1, 31 \n\t"
- : [step3_21] "=r" (step3_21)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_21] "=r"(step3_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -556,7 +545,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_30 = step3_30 + step3_25;
step2_31 = step3_31 + step3_24;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 32(%[input]) \n\t"
"lh %[load3], 16(%[input]) \n\t"
@@ -588,19 +577,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sub %[step1_2], %[temp1], %[temp2] \n\t"
"sub %[step1_3], %[temp0], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
- [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
"lh %[load1], 8(%[input]) \n\t"
"lh %[load2], 56(%[input]) \n\t"
"lh %[load3], 40(%[input]) \n\t"
@@ -649,17 +636,15 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_4], %[temp0], %[temp1] \n\t"
"add %[step1_7], %[temp3], %[temp2] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
- [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
step2_0 = step1_0 + step1_7;
step2_1 = step1_1 + step1_6;
@@ -688,67 +673,63 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step1_14 = step2_1 - step3_14;
step1_15 = step2_0 - step3_15;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_27], %[step2_20] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_20], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
- : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
- [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_20 + step2_27) * cospi_16_64;
step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_26], %[step2_21] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_21], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
- : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
- [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
+ [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_21 + step2_26) * cospi_16_64;
step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_25], %[step2_22] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_22], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
- : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
- [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
+ [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_22 + step2_25) * cospi_16_64;
step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_24], %[step2_23] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_23], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
- : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
- [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
+ [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_23 + step2_24) * cospi_16_64;
step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_0], %[step2_31] \n\t"
"addi %[temp0], %[temp0], 32 \n\t"
@@ -783,21 +764,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix]) \n\t"
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
- [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
- [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
- [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
+ [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
+ [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
+ [step2_31] "r"(step2_31));
step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
@@ -820,14 +800,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix1]) \n\t"
"subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
- [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_4], %[step1_27] \n\t"
"addi %[temp0], %[temp0], 32 \n\t"
@@ -862,21 +841,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix]) \n\t"
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
- [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
- [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
- [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
+ [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
+ [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
+ [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
+ [step1_27] "r"(step1_27));
step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
@@ -899,14 +877,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix1]) \n\t"
"subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
- [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_8], %[step1_23] \n\t"
"addi %[temp0], %[temp0], 32 \n\t"
@@ -941,21 +918,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix]) \n\t"
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
- [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
- [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
- [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
+ [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
+ [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
+ [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
+ [step1_23] "r"(step1_23));
step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
@@ -978,14 +954,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix1]) \n\t"
"subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
- [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_12], %[step2_19] \n\t"
"addi %[temp0], %[temp0], 32 \n\t"
@@ -1019,21 +994,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
- [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
- [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
- [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
+ [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
+ [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
+ [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
@@ -1055,12 +1029,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
- [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
- : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
- [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
- [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
input += 32;
}
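Throughout these dspr2 blocks the accumulator pattern "mtlo 8192 / madd / extp ..., 31" (with the extract position set to 45 via wrdsp) and the plain C lines of the form (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS compute the same fixed-point round-and-shift that the asm comments call dct_const_round_shift(). A minimal standalone C sketch of that step; DCT_CONST_BITS = 14 and cospi_16_64 = 11585 are restated here on the assumption that they match txfm_common.h, and half_btf_add() is an illustrative name, not a function in the tree:

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) /* 8192, the mtlo seed */

    /* Round-to-nearest downshift; the asm reaches the same result by seeding
     * the accumulator with 8192 and extracting bits 45..14 (a shift by 14). */
    static int32_t dct_const_round_shift(int64_t x) {
      return (int32_t)((x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }

    /* Example: the step1_27 update above, written without the accumulator. */
    static int32_t half_btf_add(int32_t step2_20, int32_t step2_27) {
      const int32_t cospi_16_64 = 11585; /* assumed, approx 2^14 / sqrt(2) */
      return dct_const_round_shift((int64_t)(step2_20 + step2_27) * cospi_16_64);
    }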
diff --git a/vpx_dsp/mips/itrans32_dspr2.c b/vpx_dsp/mips/itrans32_dspr2.c
index 523da1df1..d71c5ffed 100644
--- a/vpx_dsp/mips/itrans32_dspr2.c
+++ b/vpx_dsp/mips/itrans32_dspr2.c
@@ -40,16 +40,16 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
const int const_2_power_13 = 8192;
const int32_t *input_int;
- for (i = no_rows; i--; ) {
+ for (i = no_rows; i--;) {
input_int = (const int32_t *)input;
- if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
- input_int[4] | input_int[5] | input_int[6] | input_int[7] |
- input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
input += 32;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sh $zero, 0(%[output]) \n\t"
"sh $zero, 64(%[output]) \n\t"
"sh $zero, 128(%[output]) \n\t"
@@ -84,8 +84,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sh $zero, 1984(%[output]) \n\t"
:
- : [output] "r" (output)
- );
+ : [output] "r"(output));
output += 1;
@@ -96,7 +95,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
prefetch_load((const uint8_t *)(input + 32));
prefetch_load((const uint8_t *)(input + 48));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 2(%[input]) \n\t"
"lh %[load2], 62(%[input]) \n\t"
"lh %[load3], 34(%[input]) \n\t"
@@ -146,19 +145,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_16], %[temp0], %[temp1] \n\t"
"add %[step1_31], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
- [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
- [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
+ [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
+ [step1_31] "=r"(step1_31)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
+ [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
"lh %[load1], 18(%[input]) \n\t"
"lh %[load2], 46(%[input]) \n\t"
"lh %[load3], 50(%[input]) \n\t"
@@ -208,19 +205,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_19], %[temp0], %[temp1] \n\t"
"add %[step1_28], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
- [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
- [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
+ [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
+ [step1_29] "=r"(step1_29)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
+ [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 54(%[input]) \n\t"
"lh %[load3], 42(%[input]) \n\t"
@@ -270,19 +265,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_20], %[temp0], %[temp1] \n\t"
"add %[step1_27], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
- [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
- [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
- [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
+ [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
+ [step1_27] "=r"(step1_27)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
+ [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
"lh %[load1], 26(%[input]) \n\t"
"lh %[load2], 38(%[input]) \n\t"
"lh %[load3], 58(%[input]) \n\t"
@@ -332,19 +325,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_23], %[temp0], %[temp1] \n\t"
"add %[step1_24], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
- [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
- [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
- [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
+ [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
+ [step1_25] "=r"(step1_25)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
+ [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
"lh %[load1], 4(%[input]) \n\t"
"lh %[load2], 60(%[input]) \n\t"
"lh %[load3], 36(%[input]) \n\t"
@@ -394,19 +385,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_8], %[temp0], %[temp1] \n\t"
"add %[step2_15], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
- [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
- [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
- [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
+ [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
+ [step2_15] "=r"(step2_15)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
"lh %[load1], 20(%[input]) \n\t"
"lh %[load2], 44(%[input]) \n\t"
"lh %[load3], 52(%[input]) \n\t"
@@ -456,19 +445,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_11], %[temp0], %[temp1] \n\t"
"add %[step2_12], %[temp2], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
- [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
- [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
- [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
- );
-
- __asm__ __volatile__ (
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"sub %[temp0], %[step2_14], %[step2_13] \n\t"
@@ -507,34 +494,31 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"extp %[step3_11], $ac2, 31 \n\t"
"extp %[step3_12], $ac3, 31 \n\t"
- : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
- [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
- [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
- [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
- [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
- [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
- [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
+ [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
+ [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
+ [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
+ [step3_15] "=r"(step3_15)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
+ [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
+ [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
+ [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
+ [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
step2_18 = step1_17 - step1_18;
step2_29 = step1_30 - step1_29;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
"extp %[step3_18], $ac0, 31 \n\t"
- : [step3_18] "=r" (step3_18)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_18] "=r"(step3_18)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
+ [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -542,18 +526,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_19 = step1_16 - step1_19;
step2_28 = step1_31 - step1_28;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
"extp %[step3_19], $ac0, 31 \n\t"
- : [step3_19] "=r" (step3_19)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_19] "=r"(step3_19)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
+ [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -566,18 +549,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_20 = step1_23 - step1_20;
step2_27 = step1_24 - step1_27;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
"msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
"extp %[step3_20], $ac0, 31 \n\t"
- : [step3_20] "=r" (step3_20)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_20] "=r"(step3_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -585,18 +567,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_21 = step1_22 - step1_21;
step2_26 = step1_25 - step1_26;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t"
"msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
"msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
"extp %[step3_21], $ac1, 31 \n\t"
- : [step3_21] "=r" (step3_21)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
- );
+ : [step3_21] "=r"(step3_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -624,7 +605,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_30 = step3_30 + step3_25;
step2_31 = step3_31 + step3_24;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 32(%[input]) \n\t"
"lh %[load3], 16(%[input]) \n\t"
@@ -658,20 +639,19 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[step1_2], %[temp1], %[temp2] \n\t"
"sub %[step1_3], %[temp0], %[temp3] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [result1] "=&r" (result1), [result2] "=&r" (result2),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
- [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_16_64] "r" (cospi_16_64),
- [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64)
- );
+ );
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lh %[load1], 8(%[input]) \n\t"
"lh %[load2], 56(%[input]) \n\t"
"lh %[load3], 40(%[input]) \n\t"
@@ -724,17 +704,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_4], %[temp0], %[temp1] \n\t"
"add %[step1_7], %[temp3], %[temp2] \n\t"
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [load3] "=&r" (load3), [load4] "=&r" (load4),
- [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
- [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
- [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
- [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
- : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
- [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
step2_0 = step1_0 + step1_7;
step2_1 = step1_1 + step1_6;
@@ -762,66 +740,58 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step1_14 = step2_1 - step3_14;
step1_15 = step2_0 - step3_15;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_27], %[step2_20] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_20], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_20 + step2_27) * cospi_16_64;
step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_26], %[step2_21] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_21], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
+ [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_21 + step2_26) * cospi_16_64;
step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_25], %[step2_22] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_22], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
+ [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_22 + step2_25) * cospi_16_64;
step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sub %[temp0], %[step2_24], %[step2_23] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_23], $ac0, 31 \n\t"
- : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
- : [const_2_power_13] "r" (const_2_power_13),
- [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
- [cospi_16_64] "r" (cospi_16_64)
- );
+ : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
+ [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
temp21 = (step2_23 + step2_24) * cospi_16_64;
step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -867,16 +837,14 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
int16_t *outptr = out;
uint32_t pos = 45;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
// Rows
idct32_rows_dspr2(input, outptr, 32);
@@ -887,23 +855,21 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
int stride) {
- DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
int16_t *outptr = out;
uint32_t i;
uint32_t pos = 45;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
// Rows
idct32_rows_dspr2(input, outptr, 8);
outptr += 8;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 4(%[outptr]) \n\t"
"sw $zero, 8(%[outptr]) \n\t"
@@ -918,13 +884,12 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 44(%[outptr]) \n\t"
:
- : [outptr] "r" (outptr)
- );
+ : [outptr] "r"(outptr));
for (i = 0; i < 31; ++i) {
outptr += 32;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 4(%[outptr]) \n\t"
"sw $zero, 8(%[outptr]) \n\t"
@@ -939,8 +904,7 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 44(%[outptr]) \n\t"
:
- : [outptr] "r" (outptr)
- );
+ : [outptr] "r"(outptr));
}
// Columns
@@ -949,43 +913,39 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
int stride) {
- int r, out;
- int32_t a1, absa1;
- int32_t vector_a1;
- int32_t t1, t2, t3, t4;
- int32_t vector_1, vector_2, vector_3, vector_4;
- uint32_t pos = 45;
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ :
+ : [pos] "r"(pos));
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"addi %[out], %[out], 32 \n\t"
"sra %[a1], %[out], 6 \n\t"
- : [out] "+r" (out), [a1] "=r" (a1)
- :
- );
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
if (a1 < 0) {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t"
- : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 32; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t"
@@ -1014,25 +974,22 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
"add %[dest], %[dest], %[stride] \n\t"
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
- [dest] "+&r" (dest)
- : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
- );
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
- "replv.qb %[vector_a1], %[a1] \n\t"
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
- : [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 32; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t"
@@ -1061,12 +1018,11 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
"add %[dest], %[dest], %[stride] \n\t"
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
- [dest] "+&r" (dest)
- : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
- );
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
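The *_1_add kernels above never read more than input[0]: once the DC offset a1 is known, the asm replicates it into all four bytes of a register with replv.qb and adds it to four pixels at a time with the saturating addu_s.qb (or subtracts its absolute value with subu_s.qb when a1 is negative). A scalar sketch of the per-pixel effect, assuming the usual 0..255 clamp; the names below are illustrative only:

    #include <stdint.h>

    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Scalar equivalent of the quad-byte saturating add: every pixel of the
     * size x size block receives the same DC offset a1. */
    static void dc_only_add(uint8_t *dest, int stride, int a1, int size) {
      int r, c;
      for (r = 0; r < size; ++r) {
        for (c = 0; c < size; ++c) dest[c] = clip_pixel(dest[c] + a1);
        dest += stride;
      }
    }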
diff --git a/vpx_dsp/mips/itrans4_dspr2.c b/vpx_dsp/mips/itrans4_dspr2.c
index ecb8bd3de..516ea80f4 100644
--- a/vpx_dsp/mips/itrans4_dspr2.c
+++ b/vpx_dsp/mips/itrans4_dspr2.c
@@ -15,13 +15,13 @@
#if HAVE_DSPR2
void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
- int16_t step_0, step_1, step_2, step_3;
- int Temp0, Temp1, Temp2, Temp3;
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
const int const_2_power_13 = 8192;
- int i;
+ int i;
- for (i = 4; i--; ) {
- __asm__ __volatile__ (
+ for (i = 4; i--;) {
+ __asm__ __volatile__(
/*
temp_1 = (input[0] + input[2]) * cospi_16_64;
step_0 = dct_const_round_shift(temp_1);
@@ -83,16 +83,12 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
"sub %[Temp3], %[step_0], %[step_3] \n\t"
"sh %[Temp3], 24(%[output]) \n\t"
- : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
- [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
- [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
- [output] "+r" (output)
- : [const_2_power_13] "r" (const_2_power_13),
- [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
- [cospi_24_64] "r" (cospi_24_64),
- [input] "r" (input)
- );
+ : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
+ [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
input += 4;
output += 1;
@@ -101,27 +97,27 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
int dest_stride) {
- int16_t step_0, step_1, step_2, step_3;
- int Temp0, Temp1, Temp2, Temp3;
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
const int const_2_power_13 = 8192;
- int i;
- uint8_t *dest_pix;
- uint8_t *cm = vpx_ff_cropTbl;
+ int i;
+ uint8_t *dest_pix;
+ uint8_t *cm = vpx_ff_cropTbl;
/* prefetch vpx_ff_cropTbl */
prefetch_load(vpx_ff_cropTbl);
- prefetch_load(vpx_ff_cropTbl + 32);
- prefetch_load(vpx_ff_cropTbl + 64);
- prefetch_load(vpx_ff_cropTbl + 96);
+ prefetch_load(vpx_ff_cropTbl + 32);
+ prefetch_load(vpx_ff_cropTbl + 64);
+ prefetch_load(vpx_ff_cropTbl + 96);
prefetch_load(vpx_ff_cropTbl + 128);
prefetch_load(vpx_ff_cropTbl + 160);
prefetch_load(vpx_ff_cropTbl + 192);
prefetch_load(vpx_ff_cropTbl + 224);
for (i = 0; i < 4; ++i) {
- dest_pix = (dest + i);
+ dest_pix = (dest + i);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/*
temp_1 = (input[0] + input[2]) * cospi_16_64;
step_0 = dct_const_round_shift(temp_1);
@@ -206,16 +202,14 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
- [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
- [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
- [dest_pix] "+r" (dest_pix)
- : [const_2_power_13] "r" (const_2_power_13),
- [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
- [cospi_24_64] "r" (cospi_24_64),
- [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
- );
+ : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
+ [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
+ [dest_pix] "+r"(dest_pix)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
+ [dest_stride] "r"(dest_stride));
input += 4;
}
@@ -228,11 +222,9 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
// Rows
vpx_idct4_rows_dspr2(input, outptr);
@@ -243,73 +235,63 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
int dest_stride) {
- int a1, absa1;
- int r;
- int32_t out;
- int t2, vector_a1, vector_a;
- uint32_t pos = 45;
- int16_t input_dc = input[0];
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ :
+ : [pos] "r"(pos));
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"addi %[out], %[out], 8 \n\t"
"sra %[a1], %[out], 4 \n\t"
- : [out] "+r" (out), [a1] "=r" (a1)
- :
- );
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
if (a1 < 0) {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t"
- : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 4; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t2], 0(%[dest]) \n\t"
"subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t"
- : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
- [dest] "+&r" (dest)
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
- );
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
- "replv.qb %[vector_a1], %[a1] \n\t"
- : [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 4; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t2], 0(%[dest]) \n\t"
"addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t"
- : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
- [dest] "+&r" (dest)
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
- );
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
}
}
}
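The DC offset itself is derived the same way in the 4x4, 8x8 and 32x32 variants: input[0] is passed twice through the cospi_16_64 round/shift (the DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 macro above) and then rounded by the block's final shift, which is what the addi/sra pairs implement (add 8, shift 4 for 4x4; add 16, shift 5 for 8x8; add 32, shift 6 for 32x32). A scalar sketch under the assumption that cospi_16_64 is 11585 and the round/shift width is 14; dc_only_offset() is an illustrative name:

    #include <stdint.h>

    /* DC offset for a DC-only inverse transform.  final_shift matches the
     * asm's addi/sra pair: 4 for 4x4, 5 for 8x8, 6 for 32x32. */
    static int dc_only_offset(int16_t dc, int final_shift) {
      int64_t out = ((int64_t)dc * 11585 + 8192) >> 14; /* dct_const_round_shift */
      out = (out * 11585 + 8192) >> 14;                 /* applied a second time */
      return (int)((out + (1 << (final_shift - 1))) >> final_shift);
    }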
diff --git a/vpx_dsp/mips/itrans8_dspr2.c b/vpx_dsp/mips/itrans8_dspr2.c
index 823e845d5..08a6c78b6 100644
--- a/vpx_dsp/mips/itrans8_dspr2.c
+++ b/vpx_dsp/mips/itrans8_dspr2.c
@@ -20,8 +20,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
int Temp0, Temp1, Temp2, Temp3, Temp4;
int i;
- for (i = no_rows; i--; ) {
- __asm__ __volatile__ (
+ for (i = no_rows; i--;) {
+ __asm__ __volatile__(
/*
temp_1 = (input[0] + input[4]) * cospi_16_64;
step2_0 = dct_const_round_shift(temp_1);
@@ -174,20 +174,18 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
"sub %[Temp1], %[step1_0], %[step1_7] \n\t"
"sh %[Temp1], 112(%[output]) \n\t"
- : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
- [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
- [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
- [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
- [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
- [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [Temp4] "=&r" (Temp4)
- : [const_2_power_13] "r" (const_2_power_13),
- [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
- [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
- [cospi_24_64] "r" (cospi_24_64),
- [output] "r" (output), [input] "r" (input)
- );
+ : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
+ [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
+ [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
+ [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
+ [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
+ [input] "r"(input));
input += 8;
output += 1;
@@ -205,18 +203,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
/* prefetch vpx_ff_cropTbl */
prefetch_load(vpx_ff_cropTbl);
- prefetch_load(vpx_ff_cropTbl + 32);
- prefetch_load(vpx_ff_cropTbl + 64);
- prefetch_load(vpx_ff_cropTbl + 96);
+ prefetch_load(vpx_ff_cropTbl + 32);
+ prefetch_load(vpx_ff_cropTbl + 64);
+ prefetch_load(vpx_ff_cropTbl + 96);
prefetch_load(vpx_ff_cropTbl + 128);
prefetch_load(vpx_ff_cropTbl + 160);
prefetch_load(vpx_ff_cropTbl + 192);
prefetch_load(vpx_ff_cropTbl + 224);
for (i = 0; i < 8; ++i) {
- dest_pix = (dest + i);
+ dest_pix = (dest + i);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/*
temp_1 = (input[0] + input[4]) * cospi_16_64;
step2_0 = dct_const_round_shift(temp_1);
@@ -423,20 +421,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t"
- : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
- [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
- [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
- [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
- [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
- [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dest_pix] "+r" (dest_pix)
- : [const_2_power_13] "r" (const_2_power_13),
- [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
- [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
- [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
- [cospi_24_64] "r" (cospi_24_64),
- [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
- );
+ : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
+ [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
+ [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
+ [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
+ [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
+ [dest_stride] "r"(dest_stride));
input += 8;
}
@@ -449,11 +445,7 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
// First transform rows
idct8_rows_dspr2(input, outptr, 8);
@@ -469,18 +461,14 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
// First transform rows
idct8_rows_dspr2(input, outptr, 4);
outptr += 4;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 4(%[outptr]) \n\t"
"sw $zero, 16(%[outptr]) \n\t"
@@ -499,9 +487,7 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 116(%[outptr]) \n\t"
:
- : [outptr] "r" (outptr)
- );
-
+ : [outptr] "r"(outptr));
// Then transform columns and add to dest
idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
@@ -516,35 +502,31 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
int32_t t1, t2, vector_a1, vector_1, vector_2;
/* bit position for extract from acc */
- __asm__ __volatile__ (
- "wrdsp %[pos], 1 \n\t"
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r" (pos)
- );
+ :
+ : [pos] "r"(pos));
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"addi %[out], %[out], 16 \n\t"
"sra %[a1], %[out], 5 \n\t"
- : [out] "+r" (out), [a1] "=r" (a1)
- :
- );
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
if (a1 < 0) {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t"
- : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 8; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t"
"subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
@@ -553,24 +535,20 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_2], 4(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t"
- : [t1] "=&r" (t1), [t2] "=&r" (t2),
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
- [dest] "+&r" (dest)
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
- );
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
}
} else {
/* use quad-byte
* input and output memory are four byte aligned */
- __asm__ __volatile__ (
- "replv.qb %[vector_a1], %[a1] \n\t"
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
- : [vector_a1] "=r" (vector_a1)
- : [a1] "r" (a1)
- );
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
for (r = 8; r--;) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t"
"addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
@@ -579,11 +557,9 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_2], 4(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t"
- : [t1] "=&r" (t1), [t2] "=&r" (t2),
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
- [dest] "+r" (dest)
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
- );
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
}
}
}
@@ -602,20 +578,20 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) {
x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4]
- = output[5] = output[6] = output[7] = 0;
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = 0;
return;
}
// stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
@@ -631,10 +607,10 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) {
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
x0 = s0 + s2;
x1 = s1 + s3;
@@ -656,13 +632,13 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) {
x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
- output[0] = x0;
+ output[0] = x0;
output[1] = -x4;
- output[2] = x6;
+ output[2] = x6;
output[3] = -x2;
- output[4] = x3;
+ output[4] = x3;
output[5] = -x7;
- output[6] = x5;
+ output[6] = x5;
output[7] = -x1;
}
#endif // HAVE_DSPR2
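The column-add blocks in these files clamp through a table instead of branching: "lbux %[temp], %[offset](%[cm])" reads the already clamped byte out of cm (vpx_ff_cropTbl), which is indexed around a zero offset so that negative or greater-than-255 sums land on 0 or 255. The sketch below shows the idea only; the margin size and initialization are illustrative and not the exact vpx_ff_cropTbl layout:

    #include <stdint.h>

    #define CROP_MARGIN 512 /* illustrative guard band on each side */
    static uint8_t crop_storage[2 * CROP_MARGIN + 256];
    static uint8_t *cm; /* points at the zero offset, like the cm operand above */

    static void init_crop_table(void) {
      int i;
      for (i = 0; i < 2 * CROP_MARGIN + 256; ++i) {
        const int v = i - CROP_MARGIN;
        crop_storage[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
      cm = crop_storage + CROP_MARGIN;
    }

    /* Per-pixel effect of the lbu / add / lbux / sb sequence:
     * dest[0] = clamp(dest[0] + val) with a single table read. */
    static void add_and_store(uint8_t *dest, int val) {
      dest[0] = cm[dest[0] + val];
    }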
diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c
index a6c581d72..2c8a10fc5 100644
--- a/vpx_dsp/mips/loopfilter_16_msa.c
+++ b/vpx_dsp/mips/loopfilter_16_msa.c
@@ -11,8 +11,7 @@
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/loopfilter_msa.h"
-int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
- uint8_t *filter48,
+int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr,
const uint8_t *thresh_ptr) {
@@ -33,8 +32,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
@@ -43,9 +42,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
return 1;
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
- q2_r, q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
@@ -107,8 +105,8 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
} else {
src -= 7 * pitch;
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
- zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
p2_r_in, p1_r_in, p0_r_in);
q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
@@ -408,8 +406,7 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr,
- int32_t count) {
+ const uint8_t *thresh_ptr, int32_t count) {
DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
uint8_t early_exit = 0;
@@ -426,8 +423,7 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr,
- int32_t count) {
+ const uint8_t *thresh_ptr, int32_t count) {
if (1 == count) {
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
uint64_t dword0, dword1;
@@ -449,8 +445,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
q1_out);
@@ -472,9 +468,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
/* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
- zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
- q0_filter8);
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
/* store pixel values */
@@ -668,8 +663,8 @@ static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- LD_UB8(input, in_pitch,
- p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
+ LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
+ p1_org, p0_org);
/* 8x8 transpose */
TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
@@ -699,8 +694,8 @@ static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}
-static void transpose_16x16(uint8_t *input, int32_t in_pitch,
- uint8_t *output, int32_t out_pitch) {
+static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
+ int32_t out_pitch) {
v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -709,12 +704,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch,
LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
input += (8 * in_pitch);
- LD_UB8(input, in_pitch,
- row8, row9, row10, row11, row12, row13, row14, row15);
+ LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
- TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
- row8, row9, row10, row11, row12, row13, row14, row15,
- p7, p6, p5, p4, p3, p2, p1, p0);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p7, p6,
+ p5, p4, p3, p2, p1, p0);
/* transpose 16x8 matrix into 8x16 */
/* total 8 intermediate register and 32 instructions */
@@ -779,8 +773,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
/* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */
@@ -794,9 +788,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
return 1;
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
- q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
@@ -864,9 +857,9 @@ int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
} else {
src -= 7 * 16;
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
- zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
- p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
tmp0_r = p7_r_in << 3;
@@ -1056,9 +1049,9 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
- early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
- &filter48[0], src, pitch, b_limit_ptr,
- limit_ptr, thresh_ptr);
+ early_exit =
+ vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
if (0 == early_exit) {
early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
@@ -1093,8 +1086,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
/* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */
@@ -1113,9 +1106,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
return 1;
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
- q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
@@ -1196,9 +1188,9 @@ int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
} else {
src -= 7 * 16;
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
- zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
- p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
tmp0_r = p7_r_in << 3;
@@ -1479,9 +1471,9 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
- early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
- &filter48[0], src, pitch, b_limit_ptr,
- limit_ptr, thresh_ptr);
+ early_exit =
+ vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
if (0 == early_exit) {
early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
index 936347031..e0079665f 100644
--- a/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c
@@ -25,8 +25,8 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
@@ -61,8 +61,8 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
@@ -82,10 +82,10 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr);
- TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
- p3, p2, p1, p0, q0, q1, q2, q3);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
@@ -111,12 +111,12 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
- LD_UB8(src - 4 + (8 * pitch), pitch,
- row8, row9, row10, row11, row12, row13, row14, row15);
+ LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
+ row14, row15);
- TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
- row8, row9, row10, row11, row12, row13, row14, row15,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
@@ -130,8 +130,8 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
index 5b22bd002..403e5dc51 100644
--- a/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c
@@ -29,8 +29,8 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
@@ -43,16 +43,14 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
- q2_r, q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
/* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
- zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
- q0_filter8);
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
/* store pixel values */
@@ -80,13 +78,10 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
}
}
-void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *b_limit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
+void vpx_lpf_horizontal_8_dual_msa(
+ uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
@@ -112,17 +107,16 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
/* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
if (__msa_test_bz_v(flat)) {
ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
- q2_r, q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
@@ -170,16 +164,16 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
/* load vector elements */
LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
- TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
thresh = (v16u8)__msa_fill_b(*thresh_ptr);
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
/* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */
@@ -197,9 +191,8 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
src += 4 * pitch;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
- q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
/* convert 16 bit output data into 8 bit */
@@ -232,11 +225,9 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
}
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0,
- const uint8_t *limit0,
+ const uint8_t *b_limit0, const uint8_t *limit0,
const uint8_t *thresh0,
- const uint8_t *b_limit1,
- const uint8_t *limit1,
+ const uint8_t *b_limit1, const uint8_t *limit1,
const uint8_t *thresh1) {
uint8_t *temp_src;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
@@ -257,9 +248,9 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
/* transpose 16x8 matrix into 8x16 */
- TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
- q3, q2, q1, q0, row12, row13, row14, row15,
- p3, p2, p1, p0, q0, q1, q2, q3);
+ TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+ row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+ q3);
thresh = (v16u8)__msa_fill_b(*thresh0);
vec0 = (v8i16)__msa_fill_b(*thresh1);
@@ -274,8 +265,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
/* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
- hev, mask, flat);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
/* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */
@@ -292,9 +283,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
src += 8 * pitch;
ST4x8_UB(vec4, vec5, src, pitch);
} else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
- q3_r);
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 8414b9ed5..f1743679a 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -19,33 +19,30 @@
#include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2
-void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask;
- uint32_t hev;
- uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
- uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
+ uint8_t i;
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
- : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
- [limit_vec] "=r" (limit_vec)
- : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
- );
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
/* prefetch data for store */
prefetch_store(s);
@@ -62,49 +59,44 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
s5 = s4 + pitch;
s6 = s5 + pitch;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[p1], (%[s1]) \n\t"
"lw %[p2], (%[s2]) \n\t"
"lw %[p3], (%[s3]) \n\t"
"lw %[p4], (%[s4]) \n\t"
- : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
- : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
- );
+ : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
/* if (p1 - p4 == 0) and (p2 - p3 == 0)
mask will be zero and filtering is not needed */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[pm1], (%[sm1]) \n\t"
"lw %[p0], (%[s0]) \n\t"
"lw %[p5], (%[s5]) \n\t"
"lw %[p6], (%[s6]) \n\t"
- : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
- [p6] "=&r" (p6)
- : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
- );
+ : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
+ : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
- filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
- pm1, p0, p3, p4, p5, p6,
- thresh_vec, &hev, &mask);
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
/* if mask == 0 do filtering is not needed */
if (mask) {
/* filtering */
filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[p1], (%[s1]) \n\t"
"sw %[p2], (%[s2]) \n\t"
"sw %[p3], (%[s3]) \n\t"
"sw %[p4], (%[s4]) \n\t"
:
- : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
- [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
- );
+ : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
+ [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
}
}
@@ -112,33 +104,30 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
}
}
-void vpx_lpf_vertical_4_dspr2(unsigned char *s,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
+void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev;
- uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
+ uint8_t i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
- : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
- [limit_vec] "=r" (limit_vec)
- : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
- );
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
/* prefetch data for store */
prefetch_store(s + pitch);
@@ -148,22 +137,22 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
s2 = s + pitch;
s3 = s2 + pitch;
s4 = s3 + pitch;
- s = s4 + pitch;
+ s = s4 + pitch;
/* load quad-byte vectors
* memory is 4 byte aligned
*/
- p2 = *((uint32_t *)(s1 - 4));
- p6 = *((uint32_t *)(s1));
- p1 = *((uint32_t *)(s2 - 4));
- p5 = *((uint32_t *)(s2));
- p0 = *((uint32_t *)(s3 - 4));
- p4 = *((uint32_t *)(s3));
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
pm1 = *((uint32_t *)(s4 - 4));
- p3 = *((uint32_t *)(s4));
+ p3 = *((uint32_t *)(s4));
/* transpose pm1, p0, p1, p2 */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
@@ -179,15 +168,13 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
"append %[p1], %[sec3], 16 \n\t"
"append %[pm1], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
/* transpose p3, p4, p5, p6 */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
"precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
"precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
@@ -203,20 +190,17 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
"append %[p5], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
/* if (p1 - p4 == 0) and (p2 - p3 == 0)
* mask will be zero and filtering is not needed
*/
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
- filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
- p0, p3, p4, p5, p6, thresh_vec,
- &hev, &mask);
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
/* if mask == 0 do filtering is not needed */
if (mask) {
@@ -227,107 +211,93 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
* don't use transpose on output data
* because memory isn't aligned
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p4], 1(%[s4]) \n\t"
"sb %[p3], 0(%[s4]) \n\t"
"sb %[p2], -1(%[s4]) \n\t"
"sb %[p1], -2(%[s4]) \n\t"
:
- : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
- [s4] "r" (s4)
- );
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s4] "r"(s4));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
- : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
- :
- );
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p4], 1(%[s3]) \n\t"
"sb %[p3], 0(%[s3]) \n\t"
"sb %[p2], -1(%[s3]) \n\t"
"sb %[p1], -2(%[s3]) \n\t"
- : [p1] "+r" (p1)
- : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
- );
+ : [p1] "+r"(p1)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
- : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
- :
- );
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p4], 1(%[s2]) \n\t"
"sb %[p3], 0(%[s2]) \n\t"
"sb %[p2], -1(%[s2]) \n\t"
"sb %[p1], -2(%[s2]) \n\t"
:
- : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
- [s2] "r" (s2)
- );
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s2] "r"(s2));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
- : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
- :
- );
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p4], 1(%[s1]) \n\t"
"sb %[p3], 0(%[s1]) \n\t"
"sb %[p2], -1(%[s1]) \n\t"
"sb %[p1], -2(%[s1]) \n\t"
:
- : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
- [s1] "r" (s1)
- );
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s1] "r"(s1));
}
}
}
}
-void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
+void vpx_lpf_horizontal_4_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}
-void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
+void vpx_lpf_horizontal_8_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}
-void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
- const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *limit0,
const uint8_t *thresh0,
const uint8_t *blimit1,
@@ -337,8 +307,7 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}
-void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
- const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *limit0,
const uint8_t *thresh0,
const uint8_t *blimit1,
@@ -348,8 +317,7 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}
-void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
- const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.h b/vpx_dsp/mips/loopfilter_filters_dspr2.h
index 4a1506ba1..5b0c73345 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.h
@@ -24,22 +24,21 @@ extern "C" {
#if HAVE_DSPR2
/* inputs & outputs are quad-byte vectors */
-static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
- uint32_t *ps1, uint32_t *ps0,
- uint32_t *qs0, uint32_t *qs1) {
- int32_t vpx_filter_l, vpx_filter_r;
- int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
- int32_t subr_r, subr_l;
- uint32_t t1, t2, HWM, t3;
- uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
- int32_t vps1, vps0, vqs0, vqs1;
- int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
- uint32_t N128;
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
+ uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
+ int32_t vpx_filter_l, vpx_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
N128 = 0x80808080;
- t1 = 0x03000300;
- t2 = 0x04000400;
- t3 = 0x01000100;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
HWM = 0xFF00FF00;
vps0 = (*ps0) ^ N128;
@@ -72,7 +71,7 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
hev_r = hev << 8;
hev_r = hev_r & HWM;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
"subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t"
"subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t"
@@ -99,20 +98,17 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
"and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t"
"and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t"
- : [vpx_filter_l] "=&r" (vpx_filter_l),
- [vpx_filter_r] "=&r" (vpx_filter_r),
- [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
- [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
- : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
- [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
- [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
- [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
- [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
- [HWM] "r" (HWM)
- );
+ : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
/* save bottom 3 bits so that we round one side +4 and the other +3 */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
"addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t"
"addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t"
@@ -137,15 +133,14 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
- : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
- [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
- [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
- [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
- : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
- [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
- );
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* (vpx_filter += 1) >>= 1 */
"addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
"addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
@@ -162,11 +157,10 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
- : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
- [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
- [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
- : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
- );
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
/* Create quad-bytes from halfword pairs */
vqs0_l = vqs0_l & HWM;
@@ -174,16 +168,15 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
vps0_l = vps0_l & HWM;
vps1_l = vps1_l & HWM;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
"shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
"shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
"shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
- : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
- [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
- :
- );
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
vqs0 = vqs0_l | vqs0_r;
vqs1 = vqs1_l | vqs1_r;
@@ -196,24 +189,23 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
*qs1 = vqs1 ^ N128;
}
-static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
- uint32_t ps1, uint32_t ps0,
- uint32_t qs0, uint32_t qs1,
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
+ uint32_t ps0, uint32_t qs0, uint32_t qs1,
uint32_t *p1_f0, uint32_t *p0_f0,
uint32_t *q0_f0, uint32_t *q1_f0) {
- int32_t vpx_filter_l, vpx_filter_r;
- int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
- int32_t subr_r, subr_l;
- uint32_t t1, t2, HWM, t3;
- uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
- int32_t vps1, vps0, vqs0, vqs1;
- int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
- uint32_t N128;
+ int32_t vpx_filter_l, vpx_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
N128 = 0x80808080;
- t1 = 0x03000300;
- t2 = 0x04000400;
- t3 = 0x01000100;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
HWM = 0xFF00FF00;
vps0 = (ps0) ^ N128;
@@ -246,7 +238,7 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
hev_r = hev << 8;
hev_r = hev_r & HWM;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
"subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t"
"subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t"
@@ -273,19 +265,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
"and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t"
"and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t"
- : [vpx_filter_l] "=&r" (vpx_filter_l),
- [vpx_filter_r] "=&r" (vpx_filter_r),
- [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
- [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
- : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
- [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
- [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
- [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
- [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
- );
+ : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
/* save bottom 3 bits so that we round one side +4 and the other +3 */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
"addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t"
"addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t"
@@ -310,15 +300,14 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
- : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
- [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
- [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
- [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
- : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
- [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
- );
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* (vpx_filter += 1) >>= 1 */
"addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
"addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
@@ -335,11 +324,10 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
- : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
- [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
- [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
- : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
- );
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
/* Create quad-bytes from halfword pairs */
vqs0_l = vqs0_l & HWM;
@@ -347,16 +335,15 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
vps0_l = vps0_l & HWM;
vps1_l = vps1_l & HWM;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
"shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
"shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
"shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
- : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
- [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
- :
- );
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
vqs0 = vqs0_l | vqs0_r;
vqs1 = vqs1_l | vqs1_r;
@@ -369,18 +356,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
*q1_f0 = vqs1 ^ N128;
}
-static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
- uint32_t *op1, uint32_t *op0,
- uint32_t *oq0, uint32_t *oq1,
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
+ uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
uint32_t *oq2, uint32_t *oq3) {
/* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
- uint32_t res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2;
- uint32_t tmp;
- uint32_t add_p210_q012;
- uint32_t u32Four = 0x00040004;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
/* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
/* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
@@ -389,7 +375,7 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
/* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
/* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
@@ -428,15 +414,12 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
"shrl.ph %[res_op0], %[res_op0], 3 \n\t"
"shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
- : [add_p210_q012] "=&r" (add_p210_q012),
- [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
- [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
- [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
- [res_oq2] "=&r" (res_oq2)
- : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
- [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
- [u32Four] "r" (u32Four)
- );
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
*op2 = res_op2;
*op1 = res_op1;
@@ -446,20 +429,18 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
*oq2 = res_oq2;
}
-static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
- uint32_t p1, uint32_t p0,
- uint32_t q0, uint32_t q1,
- uint32_t q2, uint32_t q3,
- uint32_t *op2_f1,
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3, uint32_t *op2_f1,
uint32_t *op1_f1, uint32_t *op0_f1,
uint32_t *oq0_f1, uint32_t *oq1_f1,
uint32_t *oq2_f1) {
/* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
- uint32_t res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2;
- uint32_t tmp;
- uint32_t add_p210_q012;
- uint32_t u32Four = 0x00040004;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
/* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
/* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
@@ -468,7 +449,7 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
/* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
/* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
@@ -507,14 +488,12 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
"shrl.ph %[res_op0], %[res_op0], 3 \n\t"
"shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
- : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
- [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
- [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
- [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
- : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
- [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
- [u32Four] "r" (u32Four)
- );
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
*op2_f1 = res_op2;
*op1_f1 = res_op1;
@@ -524,25 +503,22 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
*oq2_f1 = res_oq2;
}
-static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
- uint32_t *op5, uint32_t *op4,
- uint32_t *op3, uint32_t *op2,
- uint32_t *op1, uint32_t *op0,
- uint32_t *oq0, uint32_t *oq1,
- uint32_t *oq2, uint32_t *oq3,
- uint32_t *oq4, uint32_t *oq5,
- uint32_t *oq6, uint32_t *oq7) {
+static INLINE void wide_mbfilter_dspr2(
+ uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
+ uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
+ uint32_t *oq7) {
const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
- uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
- uint32_t tmp;
- uint32_t add_p6toq6;
- uint32_t u32Eight = 0x00080008;
+ uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+ uint32_t tmp;
+ uint32_t add_p6toq6;
+ uint32_t u32Eight = 0x00080008;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
which is used most of the time */
"addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
@@ -560,15 +536,13 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
"addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
"addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
- : [add_p6toq6] "=&r" (add_p6toq6)
- : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
- [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
- [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
- [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
- [u32Eight] "r" (u32Eight)
- );
+ : [add_p6toq6] "=&r"(add_p6toq6)
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
+ [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [u32Eight] "r"(u32Eight));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
p3 + p2 + p1 + p0 + q0, 4) */
"shll.ph %[tmp], %[p7], 3 \n\t"
@@ -643,16 +617,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
"addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
"shrl.ph %[res_op0], %[res_op0], 4 \n\t"
- : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
- [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
- [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
- [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
- : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
- [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
- [q2] "r" (q2), [q1] "r" (q1),
- [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
- [add_p6toq6] "r" (add_p6toq6)
- );
+ : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
+ [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [add_p6toq6] "r"(add_p6toq6));
*op6 = res_op6;
*op5 = res_op5;
@@ -662,7 +634,7 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
*op1 = res_op1;
*op0 = res_op0;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
"addu.ph %[res_oq0], %[q7], %[q0] \n\t"
@@ -737,16 +709,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
"subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
"shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
- : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
- [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
- [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
- [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
- : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
- [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
- [p1] "r" (p1), [p2] "r" (p2),
- [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
- [add_p6toq6] "r" (add_p6toq6)
- );
+ : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
+ [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
+ [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
+ [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
+ : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
+ [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
+ [add_p6toq6] "r"(add_p6toq6));
*oq0 = res_oq0;
*oq1 = res_oq1;
diff --git a/vpx_dsp/mips/loopfilter_macros_dspr2.h b/vpx_dsp/mips/loopfilter_macros_dspr2.h
index 994ff185a..38ed0b2a6 100644
--- a/vpx_dsp/mips/loopfilter_macros_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_macros_dspr2.h
@@ -22,453 +22,410 @@ extern "C" {
#endif
#if HAVE_DSPR2
-#define STORE_F0() { \
- __asm__ __volatile__ ( \
- "sb %[q1_f0], 1(%[s4]) \n\t" \
- "sb %[q0_f0], 0(%[s4]) \n\t" \
- "sb %[p0_f0], -1(%[s4]) \n\t" \
- "sb %[p1_f0], -2(%[s4]) \n\t" \
- \
- : \
- : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
- [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
- [s4] "r" (s4) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
- [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q1_f0], 1(%[s3]) \n\t" \
- "sb %[q0_f0], 0(%[s3]) \n\t" \
- "sb %[p0_f0], -1(%[s3]) \n\t" \
- "sb %[p1_f0], -2(%[s3]) \n\t" \
- \
- : [p1_f0] "+r" (p1_f0) \
- : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
- [s3] "r" (s3), [p0_f0] "r" (p0_f0) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
- [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q1_f0], 1(%[s2]) \n\t" \
- "sb %[q0_f0], 0(%[s2]) \n\t" \
- "sb %[p0_f0], -1(%[s2]) \n\t" \
- "sb %[p1_f0], -2(%[s2]) \n\t" \
- \
- : \
- : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
- [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
- [s2] "r" (s2) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
- [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q1_f0], 1(%[s1]) \n\t" \
- "sb %[q0_f0], 0(%[s1]) \n\t" \
- "sb %[p0_f0], -1(%[s1]) \n\t" \
- "sb %[p1_f0], -2(%[s1]) \n\t" \
- \
- : \
- : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
- [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
- [s1] "r" (s1) \
- ); \
-}
+#define STORE_F0() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s4]) \n\t" \
+ "sb %[q0_f0], 0(%[s4]) \n\t" \
+ "sb %[p0_f0], -1(%[s4]) \n\t" \
+ "sb %[p1_f0], -2(%[s4]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s3]) \n\t" \
+ "sb %[q0_f0], 0(%[s3]) \n\t" \
+ "sb %[p0_f0], -1(%[s3]) \n\t" \
+ "sb %[p1_f0], -2(%[s3]) \n\t" \
+ \
+ : [p1_f0] "+r"(p1_f0) \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
+ [p0_f0] "r"(p0_f0)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s2]) \n\t" \
+ "sb %[q0_f0], 0(%[s2]) \n\t" \
+ "sb %[p0_f0], -1(%[s2]) \n\t" \
+ "sb %[p1_f0], -2(%[s2]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s1]) \n\t" \
+ "sb %[q0_f0], 0(%[s1]) \n\t" \
+ "sb %[p0_f0], -1(%[s1]) \n\t" \
+ "sb %[p1_f0], -2(%[s1]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
+ }
-#define STORE_F1() { \
- __asm__ __volatile__ ( \
- "sb %[q2_r], 2(%[s4]) \n\t" \
- "sb %[q1_r], 1(%[s4]) \n\t" \
- "sb %[q0_r], 0(%[s4]) \n\t" \
- "sb %[p0_r], -1(%[s4]) \n\t" \
- "sb %[p1_r], -2(%[s4]) \n\t" \
- "sb %[p2_r], -3(%[s4]) \n\t" \
- \
- : \
- : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \
- [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
- [s4] "r" (s4) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q2_r], %[q2_r], 16 \n\t" \
- "srl %[q1_r], %[q1_r], 16 \n\t" \
- "srl %[q0_r], %[q0_r], 16 \n\t" \
- "srl %[p0_r], %[p0_r], 16 \n\t" \
- "srl %[p1_r], %[p1_r], 16 \n\t" \
- "srl %[p2_r], %[p2_r], 16 \n\t" \
- \
- : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \
- [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q2_r], 2(%[s3]) \n\t" \
- "sb %[q1_r], 1(%[s3]) \n\t" \
- "sb %[q0_r], 0(%[s3]) \n\t" \
- "sb %[p0_r], -1(%[s3]) \n\t" \
- "sb %[p1_r], -2(%[s3]) \n\t" \
- "sb %[p2_r], -3(%[s3]) \n\t" \
- \
- : \
- : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \
- [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
- [s3] "r" (s3) \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q2_l], 2(%[s2]) \n\t" \
- "sb %[q1_l], 1(%[s2]) \n\t" \
- "sb %[q0_l], 0(%[s2]) \n\t" \
- "sb %[p0_l], -1(%[s2]) \n\t" \
- "sb %[p1_l], -2(%[s2]) \n\t" \
- "sb %[p2_l], -3(%[s2]) \n\t" \
- \
- : \
- : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \
- [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
- [s2] "r" (s2) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q2_l], %[q2_l], 16 \n\t" \
- "srl %[q1_l], %[q1_l], 16 \n\t" \
- "srl %[q0_l], %[q0_l], 16 \n\t" \
- "srl %[p0_l], %[p0_l], 16 \n\t" \
- "srl %[p1_l], %[p1_l], 16 \n\t" \
- "srl %[p2_l], %[p2_l], 16 \n\t" \
- \
- : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \
- [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q2_l], 2(%[s1]) \n\t" \
- "sb %[q1_l], 1(%[s1]) \n\t" \
- "sb %[q0_l], 0(%[s1]) \n\t" \
- "sb %[p0_l], -1(%[s1]) \n\t" \
- "sb %[p1_l], -2(%[s1]) \n\t" \
- "sb %[p2_l], -3(%[s1]) \n\t" \
- \
- : \
- : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \
- [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
- [s1] "r" (s1) \
- ); \
-}
+#define STORE_F1() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ \
+ : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
+ [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ \
+ : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
+ [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
+ }
-#define STORE_F2() { \
- __asm__ __volatile__ ( \
- "sb %[q6_r], 6(%[s4]) \n\t" \
- "sb %[q5_r], 5(%[s4]) \n\t" \
- "sb %[q4_r], 4(%[s4]) \n\t" \
- "sb %[q3_r], 3(%[s4]) \n\t" \
- "sb %[q2_r], 2(%[s4]) \n\t" \
- "sb %[q1_r], 1(%[s4]) \n\t" \
- "sb %[q0_r], 0(%[s4]) \n\t" \
- "sb %[p0_r], -1(%[s4]) \n\t" \
- "sb %[p1_r], -2(%[s4]) \n\t" \
- "sb %[p2_r], -3(%[s4]) \n\t" \
- "sb %[p3_r], -4(%[s4]) \n\t" \
- "sb %[p4_r], -5(%[s4]) \n\t" \
- "sb %[p5_r], -6(%[s4]) \n\t" \
- "sb %[p6_r], -7(%[s4]) \n\t" \
- \
- : \
- : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \
- [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \
- [q0_r] "r" (q0_r), \
- [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
- [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \
- [p6_r] "r" (p6_r), \
- [s4] "r" (s4) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q6_r], %[q6_r], 16 \n\t" \
- "srl %[q5_r], %[q5_r], 16 \n\t" \
- "srl %[q4_r], %[q4_r], 16 \n\t" \
- "srl %[q3_r], %[q3_r], 16 \n\t" \
- "srl %[q2_r], %[q2_r], 16 \n\t" \
- "srl %[q1_r], %[q1_r], 16 \n\t" \
- "srl %[q0_r], %[q0_r], 16 \n\t" \
- "srl %[p0_r], %[p0_r], 16 \n\t" \
- "srl %[p1_r], %[p1_r], 16 \n\t" \
- "srl %[p2_r], %[p2_r], 16 \n\t" \
- "srl %[p3_r], %[p3_r], 16 \n\t" \
- "srl %[p4_r], %[p4_r], 16 \n\t" \
- "srl %[p5_r], %[p5_r], 16 \n\t" \
- "srl %[p6_r], %[p6_r], 16 \n\t" \
- \
- : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \
- [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \
- [q0_r] "+r" (q0_r), \
- [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \
- [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \
- [p6_r] "+r" (p6_r) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q6_r], 6(%[s3]) \n\t" \
- "sb %[q5_r], 5(%[s3]) \n\t" \
- "sb %[q4_r], 4(%[s3]) \n\t" \
- "sb %[q3_r], 3(%[s3]) \n\t" \
- "sb %[q2_r], 2(%[s3]) \n\t" \
- "sb %[q1_r], 1(%[s3]) \n\t" \
- "sb %[q0_r], 0(%[s3]) \n\t" \
- "sb %[p0_r], -1(%[s3]) \n\t" \
- "sb %[p1_r], -2(%[s3]) \n\t" \
- "sb %[p2_r], -3(%[s3]) \n\t" \
- "sb %[p3_r], -4(%[s3]) \n\t" \
- "sb %[p4_r], -5(%[s3]) \n\t" \
- "sb %[p5_r], -6(%[s3]) \n\t" \
- "sb %[p6_r], -7(%[s3]) \n\t" \
- \
- : \
- : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \
- [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \
- [q0_r] "r" (q0_r), \
- [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
- [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \
- [p6_r] "r" (p6_r), \
- [s3] "r" (s3) \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q6_l], 6(%[s2]) \n\t" \
- "sb %[q5_l], 5(%[s2]) \n\t" \
- "sb %[q4_l], 4(%[s2]) \n\t" \
- "sb %[q3_l], 3(%[s2]) \n\t" \
- "sb %[q2_l], 2(%[s2]) \n\t" \
- "sb %[q1_l], 1(%[s2]) \n\t" \
- "sb %[q0_l], 0(%[s2]) \n\t" \
- "sb %[p0_l], -1(%[s2]) \n\t" \
- "sb %[p1_l], -2(%[s2]) \n\t" \
- "sb %[p2_l], -3(%[s2]) \n\t" \
- "sb %[p3_l], -4(%[s2]) \n\t" \
- "sb %[p4_l], -5(%[s2]) \n\t" \
- "sb %[p5_l], -6(%[s2]) \n\t" \
- "sb %[p6_l], -7(%[s2]) \n\t" \
- \
- : \
- : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \
- [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \
- [q0_l] "r" (q0_l), \
- [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
- [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \
- [p6_l] "r" (p6_l), \
- [s2] "r" (s2) \
- ); \
- \
- __asm__ __volatile__ ( \
- "srl %[q6_l], %[q6_l], 16 \n\t" \
- "srl %[q5_l], %[q5_l], 16 \n\t" \
- "srl %[q4_l], %[q4_l], 16 \n\t" \
- "srl %[q3_l], %[q3_l], 16 \n\t" \
- "srl %[q2_l], %[q2_l], 16 \n\t" \
- "srl %[q1_l], %[q1_l], 16 \n\t" \
- "srl %[q0_l], %[q0_l], 16 \n\t" \
- "srl %[p0_l], %[p0_l], 16 \n\t" \
- "srl %[p1_l], %[p1_l], 16 \n\t" \
- "srl %[p2_l], %[p2_l], 16 \n\t" \
- "srl %[p3_l], %[p3_l], 16 \n\t" \
- "srl %[p4_l], %[p4_l], 16 \n\t" \
- "srl %[p5_l], %[p5_l], 16 \n\t" \
- "srl %[p6_l], %[p6_l], 16 \n\t" \
- \
- : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \
- [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \
- [q0_l] "+r" (q0_l), \
- [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \
- [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \
- [p6_l] "+r" (p6_l) \
- : \
- ); \
- \
- __asm__ __volatile__ ( \
- "sb %[q6_l], 6(%[s1]) \n\t" \
- "sb %[q5_l], 5(%[s1]) \n\t" \
- "sb %[q4_l], 4(%[s1]) \n\t" \
- "sb %[q3_l], 3(%[s1]) \n\t" \
- "sb %[q2_l], 2(%[s1]) \n\t" \
- "sb %[q1_l], 1(%[s1]) \n\t" \
- "sb %[q0_l], 0(%[s1]) \n\t" \
- "sb %[p0_l], -1(%[s1]) \n\t" \
- "sb %[p1_l], -2(%[s1]) \n\t" \
- "sb %[p2_l], -3(%[s1]) \n\t" \
- "sb %[p3_l], -4(%[s1]) \n\t" \
- "sb %[p4_l], -5(%[s1]) \n\t" \
- "sb %[p5_l], -6(%[s1]) \n\t" \
- "sb %[p6_l], -7(%[s1]) \n\t" \
- \
- : \
- : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \
- [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \
- [q0_l] "r" (q0_l), \
- [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
- [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \
- [p6_l] "r" (p6_l), \
- [s1] "r" (s1) \
- ); \
-}
+#define STORE_F2() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s4]) \n\t" \
+ "sb %[q5_r], 5(%[s4]) \n\t" \
+ "sb %[q4_r], 4(%[s4]) \n\t" \
+ "sb %[q3_r], 3(%[s4]) \n\t" \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ "sb %[p3_r], -4(%[s4]) \n\t" \
+ "sb %[p4_r], -5(%[s4]) \n\t" \
+ "sb %[p5_r], -6(%[s4]) \n\t" \
+ "sb %[p6_r], -7(%[s4]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_r], %[q6_r], 16 \n\t" \
+ "srl %[q5_r], %[q5_r], 16 \n\t" \
+ "srl %[q4_r], %[q4_r], 16 \n\t" \
+ "srl %[q3_r], %[q3_r], 16 \n\t" \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ "srl %[p3_r], %[p3_r], 16 \n\t" \
+ "srl %[p4_r], %[p4_r], 16 \n\t" \
+ "srl %[p5_r], %[p5_r], 16 \n\t" \
+ "srl %[p6_r], %[p6_r], 16 \n\t" \
+ \
+ : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
+ [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
+ [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
+ [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
+ [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s3]) \n\t" \
+ "sb %[q5_r], 5(%[s3]) \n\t" \
+ "sb %[q4_r], 4(%[s3]) \n\t" \
+ "sb %[q3_r], 3(%[s3]) \n\t" \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ "sb %[p3_r], -4(%[s3]) \n\t" \
+ "sb %[p4_r], -5(%[s3]) \n\t" \
+ "sb %[p5_r], -6(%[s3]) \n\t" \
+ "sb %[p6_r], -7(%[s3]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s2]) \n\t" \
+ "sb %[q5_l], 5(%[s2]) \n\t" \
+ "sb %[q4_l], 4(%[s2]) \n\t" \
+ "sb %[q3_l], 3(%[s2]) \n\t" \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ "sb %[p3_l], -4(%[s2]) \n\t" \
+ "sb %[p4_l], -5(%[s2]) \n\t" \
+ "sb %[p5_l], -6(%[s2]) \n\t" \
+ "sb %[p6_l], -7(%[s2]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_l], %[q6_l], 16 \n\t" \
+ "srl %[q5_l], %[q5_l], 16 \n\t" \
+ "srl %[q4_l], %[q4_l], 16 \n\t" \
+ "srl %[q3_l], %[q3_l], 16 \n\t" \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ "srl %[p3_l], %[p3_l], 16 \n\t" \
+ "srl %[p4_l], %[p4_l], 16 \n\t" \
+ "srl %[p5_l], %[p5_l], 16 \n\t" \
+ "srl %[p6_l], %[p6_l], 16 \n\t" \
+ \
+ : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
+ [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
+ [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
+ [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
+ [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s1]) \n\t" \
+ "sb %[q5_l], 5(%[s1]) \n\t" \
+ "sb %[q4_l], 4(%[s1]) \n\t" \
+ "sb %[q3_l], 3(%[s1]) \n\t" \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ "sb %[p3_l], -4(%[s1]) \n\t" \
+ "sb %[p4_l], -5(%[s1]) \n\t" \
+ "sb %[p5_l], -6(%[s1]) \n\t" \
+ "sb %[p6_l], -7(%[s1]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
+ }
-#define PACK_LEFT_0TO3() { \
- __asm__ __volatile__ ( \
- "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
- "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
- "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
- "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
- "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
- "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
- "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
- "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
- \
- : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \
- [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \
- [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \
- [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \
- : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \
- [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \
- ); \
-}
+#define PACK_LEFT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
+ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
+ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
+ "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
+ "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
+ "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
+ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
+ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
+ \
+ : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
+ [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
+ [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
-#define PACK_LEFT_4TO7() { \
- __asm__ __volatile__ ( \
- "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
- "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
- "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
- "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
- "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
- "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
- "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
- "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
- \
- : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \
- [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \
- [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \
- [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \
- : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \
- [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \
- ); \
-}
+#define PACK_LEFT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
+ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
+ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
+ "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
+ "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
+ "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
+ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
+ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
+ \
+ : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
+ [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
+ [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
-#define PACK_RIGHT_0TO3() { \
- __asm__ __volatile__ ( \
- "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
- "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
- "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
- "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
- "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
- "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
- "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
- "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
- \
- : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \
- [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \
- [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \
- [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \
- : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \
- [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \
- ); \
-}
+#define PACK_RIGHT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
+ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
+ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
+ "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
+ "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
+ "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
+ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
+ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
+ \
+ : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
+ [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
+ [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
-#define PACK_RIGHT_4TO7() { \
- __asm__ __volatile__ ( \
- "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
- "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
- "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
- "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
- "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
- "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
- "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
- "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
- \
- : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \
- [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \
- [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \
- [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \
- : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \
- [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \
- ); \
-}
+#define PACK_RIGHT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
+ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
+ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
+ "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
+ "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
+ "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
+ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
+ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
+ \
+ : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
+ [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
+ [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
-#define COMBINE_LEFT_RIGHT_0TO2() { \
- __asm__ __volatile__ ( \
- "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
- "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
- "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
- "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
- "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
- "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
- \
- : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \
- [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \
- : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \
- [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \
- [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \
- [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \
- [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \
- [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \
- ); \
-}
+#define COMBINE_LEFT_RIGHT_0TO2() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
+ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
+ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
+ "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
+ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
+ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
+ \
+ : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
+ [q1] "=&r"(q1), [q2] "=&r"(q2) \
+ : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
+ [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
+ [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
+ [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
+ }
-#define COMBINE_LEFT_RIGHT_3TO6() { \
- __asm__ __volatile__ ( \
- "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
- "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
- "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
- "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
- "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
- "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
- "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
- "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
- \
- : [p6] "=&r" (p6),[p5] "=&r" (p5), \
- [p4] "=&r" (p4),[p3] "=&r" (p3), \
- [q3] "=&r" (q3),[q4] "=&r" (q4), \
- [q5] "=&r" (q5),[q6] "=&r" (q6) \
- : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \
- [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \
- [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \
- [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \
- [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \
- [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \
- [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \
- [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \
- ); \
-}
+#define COMBINE_LEFT_RIGHT_3TO6() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
+ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
+ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
+ "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
+ "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
+ "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
+ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
+ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
+ \
+ : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
+ [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
+ [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
+ [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
+ [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
+ [q6_r] "r"(q6_r)); \
+ }
#endif // #if HAVE_DSPR2
#ifdef __cplusplus
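As a reading aid: the PACK_LEFT_*/PACK_RIGHT_* and COMBINE_LEFT_RIGHT_* macros above widen each 32-bit word of four packed pixels into two zero-extended 16-bit lanes (preceu.ph.qbl takes the two upper bytes, preceu.ph.qbr the two lower bytes) so the filter arithmetic has headroom, and precr.qb.ph narrows the results back into four bytes. A minimal C sketch of those three DSPr2 operations, assuming the documented MIPS DSP ASE semantics; the helper names are illustrative and not from the libvpx sources:

#include <stdint.h>
#include <stdio.h>

/* preceu.ph.qbl: zero-extend the two upper bytes of v into 16-bit lanes. */
static uint32_t preceu_ph_qbl(uint32_t v) {
  return (((v >> 24) & 0xFF) << 16) | ((v >> 16) & 0xFF);
}

/* preceu.ph.qbr: zero-extend the two lower bytes of v into 16-bit lanes. */
static uint32_t preceu_ph_qbr(uint32_t v) {
  return (((v >> 8) & 0xFF) << 16) | (v & 0xFF);
}

/* precr.qb.ph: keep the low byte of each 16-bit lane; the "left" operand
   supplies the upper two bytes of the repacked result. */
static uint32_t precr_qb_ph(uint32_t l, uint32_t r) {
  return (((l >> 16) & 0xFF) << 24) | ((l & 0xFF) << 16) |
         (((r >> 16) & 0xFF) << 8) | (r & 0xFF);
}

int main(void) {
  uint32_t p2 = 0x11223344;                  /* four packed 8-bit pixels */
  uint32_t p2_l = preceu_ph_qbl(p2);         /* 0x00110022 */
  uint32_t p2_r = preceu_ph_qbr(p2);         /* 0x00330044 */
  printf("%08x\n", precr_qb_ph(p2_l, p2_r)); /* round-trips to 11223344 */
  return 0;
}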
diff --git a/vpx_dsp/mips/loopfilter_masks_dspr2.h b/vpx_dsp/mips/loopfilter_masks_dspr2.h
index 2c964afaa..ee1114226 100644
--- a/vpx_dsp/mips/loopfilter_masks_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -25,18 +25,17 @@ extern "C" {
/* processing 4 pixels at the same time
* compute hev and mask in the same function */
static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
- uint32_t p1, uint32_t p0,
- uint32_t p3, uint32_t p2,
- uint32_t q0, uint32_t q1,
+ uint32_t p1, uint32_t p0, uint32_t p3,
+ uint32_t p2, uint32_t q0, uint32_t q1,
uint32_t q2, uint32_t q3,
uint32_t thresh, uint32_t *hev,
uint32_t *mask) {
- uint32_t c, r, r3, r_k;
- uint32_t s1, s2, s3;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t hev1;
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* mask |= (abs(p3 - p2) > limit) */
"subu_s.qb %[c], %[p3], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p3] \n\t"
@@ -88,14 +87,12 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
- : [c] "=&r" (c), [r_k] "=&r" (r_k),
- [r] "=&r" (r), [r3] "=&r" (r3)
- : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
- [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
- [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
- );
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* abs(p0 - q0) */
"subu_s.qb %[c], %[p0], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[p0] \n\t"
@@ -119,34 +116,27 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
"wrdsp %[r] \n\t"
"pick.qb %[s2], $0, %[ones] \n\t"
- : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
- [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
- : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
- [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
- );
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
*hev = hev1;
*mask = s2;
}
-static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
- uint32_t flimit,
- uint32_t thresh,
- uint32_t p1, uint32_t p0,
- uint32_t p3, uint32_t p2,
- uint32_t q0, uint32_t q1,
- uint32_t q2, uint32_t q3,
- uint32_t *hev,
- uint32_t *mask,
- uint32_t *flat) {
- uint32_t c, r, r3, r_k, r_flat;
- uint32_t s1, s2, s3;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t flat_thresh = 0x01010101;
- uint32_t hev1;
- uint32_t flat1;
-
- __asm__ __volatile__ (
+static INLINE void filter_hev_mask_flatmask4_dspr2(
+ uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
+ uint32_t c, r, r3, r_k, r_flat;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t hev1;
+ uint32_t flat1;
+
+ __asm__ __volatile__(
/* mask |= (abs(p3 - p2) > limit) */
"subu_s.qb %[c], %[p3], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p3] \n\t"
@@ -236,15 +226,13 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
- : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
- [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
- : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
- [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
- [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
- [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
- );
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
+ [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
/* abs(p0 - q0) */
"subu_s.qb %[c], %[p0], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[p0] \n\t"
@@ -268,29 +256,25 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
"wrdsp %[r] \n\t"
"pick.qb %[s2], $0, %[ones] \n\t"
- : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
- [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
- : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
- [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
- );
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
*hev = hev1;
*mask = s2;
*flat = flat1;
}
-static INLINE void flatmask5(uint32_t p4, uint32_t p3,
- uint32_t p2, uint32_t p1,
- uint32_t p0, uint32_t q0,
- uint32_t q1, uint32_t q2,
- uint32_t q3, uint32_t q4,
- uint32_t *flat2) {
- uint32_t c, r, r_k, r_flat;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t flat_thresh = 0x01010101;
- uint32_t flat1, flat3;
-
- __asm__ __volatile__ (
+static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t q4, uint32_t *flat2) {
+ uint32_t c, r, r_k, r_flat;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t flat1, flat3;
+
+ __asm__ __volatile__(
/* flat |= (abs(p4 - p0) > thresh) */
"subu_s.qb %[c], %[p4], %[p0] \n\t"
"subu_s.qb %[r_k], %[p0], %[p4] \n\t"
@@ -355,13 +339,11 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3,
/* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
"and %[flat1], %[flat3], %[flat1] \n\t"
- : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
- [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
- : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
- [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
- [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
- [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
- );
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
+ [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
+ [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
*flat2 = flat1;
}
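The packed subu_s.qb / cmpgu.lt.qb sequences in filter_hev_mask_dspr2, filter_hev_mask_flatmask4_dspr2 and flatmask5 evaluate, four pixel columns at a time, the same per-column tests as the generic C loop filter (the asm comments spell them out). Roughly, per column, they amount to the sketch below; the function names are illustrative only, and the real dspr2 code keeps all four columns packed in one word and produces 0x00/0xFF per byte instead of 0/1:

#include <stdlib.h>

/* Per-column model of the packed mask/hev/flat decisions. */
int lf_mask(int limit, int blimit, int p3, int p2, int p1, int p0,
            int q0, int q1, int q2, int q3) {
  int m = 0;
  m |= abs(p3 - p2) > limit;
  m |= abs(p2 - p1) > limit;
  m |= abs(p1 - p0) > limit;
  m |= abs(q1 - q0) > limit;
  m |= abs(q2 - q1) > limit;
  m |= abs(q3 - q2) > limit;
  m |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  return !m; /* non-zero: this column passes the filter mask */
}

int lf_hev(int thresh, int p1, int p0, int q0, int q1) {
  return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}

int lf_flat(int p3, int p2, int p1, int p0, int q0, int q1, int q2, int q3) {
  /* flat_thresh is 0x01010101 above, i.e. "within 1" per byte lane. */
  return abs(p1 - p0) <= 1 && abs(q1 - q0) <= 1 && abs(p2 - p0) <= 1 &&
         abs(q2 - q0) <= 1 && abs(p3 - p0) <= 1 && abs(q3 - q0) <= 1;
}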
diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
index dd0545eed..e42479257 100644
--- a/vpx_dsp/mips/loopfilter_mb_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -19,36 +19,33 @@
#include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2
-void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
- uint32_t mask;
- uint32_t hev, flat;
- uint8_t i;
- uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
- uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
- uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+ uint32_t mask;
+ uint32_t hev, flat;
+ uint8_t i;
+ uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
uflimit = *blimit;
- ulimit = *limit;
+ ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
- : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
- [limit_vec] "=r" (limit_vec)
- : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
- );
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
/* prefetch data for store */
prefetch_store(s);
@@ -63,7 +60,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
sq2 = sq1 + pitch;
sq3 = sq2 + pitch;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[p3], (%[sp3]) \n\t"
"lw %[p2], (%[sp2]) \n\t"
"lw %[p1], (%[sp1]) \n\t"
@@ -73,46 +70,39 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"lw %[q2], (%[sq2]) \n\t"
"lw %[q3], (%[sq3]) \n\t"
- : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
- [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
- : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
- );
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
- p1, p0, p3, p2, q0, q1, q2, q3,
- &hev, &mask, &flat);
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
if ((flat == 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[p1_f0], (%[sp1]) \n\t"
"sw %[p0_f0], (%[sp0]) \n\t"
"sw %[q0_f0], (%[sq0]) \n\t"
"sw %[q1_f0], (%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
} else if ((mask & flat) == 0xFFFFFFFF) {
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
COMBINE_LEFT_RIGHT_0TO2()
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[p2], (%[sp2]) \n\t"
"sw %[p1], (%[sp1]) \n\t"
"sw %[p0], (%[sp0]) \n\t"
@@ -121,28 +111,23 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sw %[q2], (%[sq2]) \n\t"
:
- : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
- [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if ((flat != 0) && (mask != 0)) {
/* filtering */
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], (%[sp2]) \n\t"
"sb %[p1_r], (%[sp1]) \n\t"
"sb %[p0_r], (%[sp0]) \n\t"
@@ -151,27 +136,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_r], (%[sq2]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], (%[sp1]) \n\t"
"sb %[p0_f0], (%[sp0]) \n\t"
"sb %[q0_f0], (%[sq0]) \n\t"
"sb %[q1_f0], (%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t"
@@ -183,15 +165,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
- [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], +1(%[sp2]) \n\t"
"sb %[p1_r], +1(%[sp1]) \n\t"
"sb %[p0_r], +1(%[sp0]) \n\t"
@@ -200,41 +181,36 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_r], +1(%[sq2]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +1(%[sp1]) \n\t"
"sb %[p0_f0], +1(%[sp0]) \n\t"
"sb %[q0_f0], +1(%[sq0]) \n\t"
"sb %[q1_f0], +1(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
- [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l], +2(%[sp2]) \n\t"
"sb %[p1_l], +2(%[sp1]) \n\t"
"sb %[p0_l], +2(%[sp0]) \n\t"
@@ -243,27 +219,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_l], +2(%[sq2]) \n\t"
:
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +2(%[sp1]) \n\t"
"sb %[p0_f0], +2(%[sp0]) \n\t"
"sb %[q0_f0], +2(%[sq0]) \n\t"
"sb %[q1_f0], +2(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t"
@@ -275,15 +248,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
- [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l], +3(%[sp2]) \n\t"
"sb %[p1_l], +3(%[sp1]) \n\t"
"sb %[p0_l], +3(%[sp0]) \n\t"
@@ -292,24 +264,21 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_l], +3(%[sq2]) \n\t"
:
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +3(%[sp1]) \n\t"
"sb %[p0_f0], +3(%[sp0]) \n\t"
"sb %[q0_f0], +3(%[sq0]) \n\t"
"sb %[q1_f0], +3(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
}
@@ -317,36 +286,33 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
}
}
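Inside vpx_lpf_horizontal_8_dspr2 above, the branches pick, per group of four columns, between the 4-tap filter1 output (flat == 0), the full mbfilter output (mask & flat all set), and a mixed case where each byte lane is stored individually: the chains of `mask & flat & 0x000000FF`, `0x0000FF00`, ... with sb plus srl peel one lane at a time. A simplified per-lane model of that store selection follows; it glosses over the fact that the real code keeps the mbfilter outputs as 16-bit lanes split across the _l/_r registers, and "row" here stands for the four consecutive bytes at one tap pointer such as sp2:

#include <stdint.h>

/* Write back one 4-pixel group: per byte lane, take the strong (mbfilter)
   result when mask & flat selects it, else the 4-tap (filter1) result when
   only mask does, else leave the pixel unchanged. */
void store_lanes(uint8_t *row, uint32_t mask, uint32_t flat, uint32_t strong,
                 uint32_t weak) {
  int lane;
  for (lane = 0; lane < 4; ++lane) {
    uint32_t lane_bits = 0xFFu << (8 * lane);
    if (mask & flat & lane_bits) {
      row[lane] = (uint8_t)(strong >> (8 * lane));
    } else if (mask & lane_bits) {
      row[lane] = (uint8_t)(weak >> (8 * lane));
    }
  }
}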
-void vpx_lpf_vertical_8_dspr2(unsigned char *s,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
+void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev, flat;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
- uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+ uint8_t i;
+ uint32_t mask, hev, flat;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
uflimit = *blimit;
- ulimit = *limit;
+ ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
- : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
- [limit_vec] "=r" (limit_vec)
- : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
- );
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
prefetch_store(s + pitch);
@@ -355,9 +321,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
s2 = s + pitch;
s3 = s2 + pitch;
s4 = s3 + pitch;
- s = s4 + pitch;
+ s = s4 + pitch;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[p0], -4(%[s1]) \n\t"
"lw %[p1], -4(%[s2]) \n\t"
"lw %[p2], -4(%[s3]) \n\t"
@@ -367,10 +333,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"lw %[q1], (%[s3]) \n\t"
"lw %[q0], (%[s4]) \n\t"
- : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
- [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
- : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
- );
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
/* transpose p3, p2, p1, p0
original (when loaded from memory)
@@ -387,7 +352,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
p2 p3_1 p2_1 p1_1 p0_1
p3 p3_0 p2_0 p1_0 p0_0
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
@@ -403,12 +368,10 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"append %[p1], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
/* transpose q0, q1, q2, q3
original (when loaded from memory)
@@ -425,7 +388,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
q1 q0_1 q1_1 q2_1 q3_1
q0 q0_0 q1_0 q2_0 q3_0
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
"precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
"precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
@@ -441,49 +404,40 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"append %[q2], %[sec3], 16 \n\t"
"append %[q0], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
- p1, p0, p3, p2, q0, q1, q2, q3,
- &hev, &mask, &flat);
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
if ((flat == 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
STORE_F0()
} else if ((mask & flat) == 0xFFFFFFFF) {
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
STORE_F1()
} else if ((flat != 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], -3(%[s4]) \n\t"
"sb %[p1_r], -2(%[s4]) \n\t"
"sb %[p0_r], -1(%[s4]) \n\t"
@@ -492,25 +446,22 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_r], +2(%[s4]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [s4] "r" (s4)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
} else if (mask & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s4]) \n\t"
"sb %[p0_f0], -1(%[s4]) \n\t"
"sb %[q0_f0], (%[s4]) \n\t"
"sb %[q1_f0], +1(%[s4]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s4] "r" (s4)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t"
@@ -522,15 +473,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
- [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], -3(%[s3]) \n\t"
"sb %[p1_r], -2(%[s3]) \n\t"
"sb %[p0_r], -1(%[s3]) \n\t"
@@ -539,66 +489,58 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_r], +2(%[s3]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [s3] "r" (s3)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
} else if (mask & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s3]) \n\t"
"sb %[p0_f0], -1(%[s3]) \n\t"
"sb %[q0_f0], (%[s3]) \n\t"
"sb %[q1_f0], +1(%[s3]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s3] "r" (s3)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
- [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__ (
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
- :
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [s2] "r" (s2)
- );
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
} else if (mask & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s2]) \n\t"
"sb %[p0_f0], -1(%[s2]) \n\t"
"sb %[q0_f0], (%[s2]) \n\t"
"sb %[q1_f0], +1(%[s2]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s2] "r" (s2)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t"
@@ -610,15 +552,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
- [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l], -3(%[s1]) \n\t"
"sb %[p1_l], -2(%[s1]) \n\t"
"sb %[p0_l], -1(%[s1]) \n\t"
@@ -627,21 +568,19 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_l], +2(%[s1]) \n\t"
:
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [s1] "r" (s1)
- );
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
} else if (mask & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s1]) \n\t"
"sb %[p0_f0], -1(%[s1]) \n\t"
"sb %[q0_f0], (%[s1]) \n\t"
"sb %[q1_f0], +1(%[s1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
- [q1_f0] "r" (q1_f0), [s1] "r" (s1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
}
}
}
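vpx_lpf_vertical_8_dspr2 reuses the same packed filters for vertical edges by first transposing the loaded 4x4 blocks of bytes in registers (the precrq.qb.ph / precr.qb.ph / append shuffles annotated in the diagrams above), so that each 32-bit word again holds one filter tap for four rows, and by writing results back with STORE_F0/STORE_F1 or the per-lane sb sequences. Functionally the shuffle is just a 4x4 byte transpose, sketched here for reference:

#include <stdint.h>

/* The register shuffle is equivalent to a plain 4x4 byte transpose. */
void transpose_4x4(const uint8_t in[4][4], uint8_t out[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      out[c][r] = in[r][c];
    }
  }
}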
diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 85e167ca0..6325762a2 100644
--- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -19,42 +19,38 @@
#include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2
-static void mb_lpf_horizontal_edge(unsigned char *s,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
- int count) {
- uint32_t mask;
- uint32_t hev, flat, flat2;
- uint8_t i;
- uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
- uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
- uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
- uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
- uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
- uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
- uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ uint32_t mask;
+ uint32_t hev, flat, flat2;
+ uint8_t i;
+ uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+ uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
uflimit = *blimit;
- ulimit = *limit;
+ ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
- : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
- [limit_vec] "=r" (limit_vec)
- : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
- );
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
/* prefetch data for store */
prefetch_store(s);
@@ -77,7 +73,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
sq6 = sq5 + pitch;
sq7 = sq6 + pitch;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[p7], (%[sp7]) \n\t"
"lw %[p6], (%[sp6]) \n\t"
"lw %[p5], (%[sp5]) \n\t"
@@ -87,13 +83,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"lw %[p1], (%[sp1]) \n\t"
"lw %[p0], (%[sp0]) \n\t"
- : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
- [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
- : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
- );
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[q0], (%[sq0]) \n\t"
"lw %[q1], (%[sq1]) \n\t"
"lw %[q2], (%[sq2]) \n\t"
@@ -103,57 +98,50 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"lw %[q6], (%[sq6]) \n\t"
"lw %[q7], (%[sq7]) \n\t"
- : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
- [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
- : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
- [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
- );
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
- p1, p0, p3, p2, q0, q1, q2, q3,
- &hev, &mask, &flat);
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
/* f0 */
if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
((flat2 != 0) && (flat == 0) && (mask != 0))) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[p1_f0], (%[sp1]) \n\t"
"sw %[p0_f0], (%[sp0]) \n\t"
"sw %[q0_f0], (%[sq0]) \n\t"
"sw %[q1_f0], (%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
} else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
(mask == 0xFFFFFFFF)) {
/* f2 */
PACK_LEFT_0TO3()
PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
- &p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l,
- &q4_l, &q5_l, &q6_l, &q7_l);
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
PACK_RIGHT_0TO3()
PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
- &p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r,
- &q4_r, &q5_r, &q6_r, &q7_r);
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
COMBINE_LEFT_RIGHT_0TO2()
COMBINE_LEFT_RIGHT_3TO6()
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[p6], (%[sp6]) \n\t"
"sw %[p5], (%[sp5]) \n\t"
"sw %[p4], (%[sp4]) \n\t"
@@ -163,13 +151,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sw %[p0], (%[sp0]) \n\t"
:
- : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
- [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
- [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
- );
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
+ [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[q6], (%[sq6]) \n\t"
"sw %[q5], (%[sq5]) \n\t"
"sw %[q4], (%[sq4]) \n\t"
@@ -179,26 +166,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sw %[q0], (%[sq0]) \n\t"
:
- : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
- [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
- [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
- [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
- );
+ : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
+ [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
+ [sq1] "r"(sq1), [sq0] "r"(sq0));
} else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
/* f1 */
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
COMBINE_LEFT_RIGHT_0TO2()
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sw %[p2], (%[sp2]) \n\t"
"sw %[p1], (%[sp1]) \n\t"
"sw %[p0], (%[sp0]) \n\t"
@@ -207,28 +191,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sw %[q2], (%[sq2]) \n\t"
:
- : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
- [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
/* f0+f1 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], (%[sp2]) \n\t"
"sb %[p1_r], (%[sp1]) \n\t"
"sb %[p0_r], (%[sp0]) \n\t"
@@ -237,27 +216,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r], (%[sq2]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], (%[sp1]) \n\t"
"sb %[p0_f0], (%[sp0]) \n\t"
"sb %[q0_f0], (%[sq0]) \n\t"
"sb %[q1_f0], (%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t"
@@ -269,15 +245,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
- [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], +1(%[sp2]) \n\t"
"sb %[p1_r], +1(%[sp1]) \n\t"
"sb %[p0_r], +1(%[sp0]) \n\t"
@@ -286,39 +261,35 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r], +1(%[sq2]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +1(%[sp1]) \n\t"
"sb %[p0_f0], +1(%[sp0]) \n\t"
"sb %[q0_f0], +1(%[sq0]) \n\t"
"sb %[q1_f0], +1(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l], +2(%[sp2]) \n\t"
"sb %[p1_l], +2(%[sp1]) \n\t"
"sb %[p0_l], +2(%[sp0]) \n\t"
@@ -327,27 +298,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l], +2(%[sq2]) \n\t"
:
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +2(%[sp1]) \n\t"
"sb %[p0_f0], +2(%[sp0]) \n\t"
"sb %[q0_f0], +2(%[sq0]) \n\t"
"sb %[q1_f0], +2(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t"
@@ -359,15 +327,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
- [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l], +3(%[sp2]) \n\t"
"sb %[p1_l], +3(%[sp1]) \n\t"
"sb %[p0_l], +3(%[sp0]) \n\t"
@@ -376,61 +343,51 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l], +3(%[sq2]) \n\t"
:
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
} else if (mask & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +3(%[sp1]) \n\t"
"sb %[p0_f0], +3(%[sp0]) \n\t"
"sb %[q0_f0], +3(%[sq0]) \n\t"
"sb %[q1_f0], +3(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
} else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
/* f0 + f1 + f2 */
/* f0 function */
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* f1 function */
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
- q0_l, q1_l, q2_l, q3_l,
- &p2_l_f1, &p1_l_f1, &p0_l_f1,
- &q0_l_f1, &q1_l_f1, &q2_l_f1);
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
- q0_r, q1_r, q2_r, q3_r,
- &p2_r_f1, &p1_r_f1, &p0_r_f1,
- &q0_r_f1, &q1_r_f1, &q2_r_f1);
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
/* f2 function */
PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
- &p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l,
- &q4_l, &q5_l, &q6_l, &q7_l);
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
- &p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r,
- &q4_r, &q5_r, &q6_r, &q7_r);
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
if (mask & flat & flat2 & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_r], (%[sp6]) \n\t"
"sb %[p5_r], (%[sp5]) \n\t"
"sb %[p4_r], (%[sp4]) \n\t"
@@ -440,14 +397,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_r], (%[sp0]) \n\t"
:
- : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
- [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
- [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
- [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
- [p0_r] "r" (p0_r), [sp0] "r" (sp0)
- );
-
- __asm__ __volatile__ (
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
"sb %[q0_r], (%[sq0]) \n\t"
"sb %[q1_r], (%[sq1]) \n\t"
"sb %[q2_r], (%[sq2]) \n\t"
@@ -457,15 +412,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_r], (%[sq6]) \n\t"
:
- : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
- [q6_r] "r" (q6_r),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
- [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
- [sq6] "r" (sq6)
- );
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
} else if (mask & flat & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r_f1], (%[sp2]) \n\t"
"sb %[p1_r_f1], (%[sp1]) \n\t"
"sb %[p0_r_f1], (%[sp0]) \n\t"
@@ -474,27 +426,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r_f1], (%[sq2]) \n\t"
:
- : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
- [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
- [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
} else if (mask & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], (%[sp1]) \n\t"
"sb %[p0_f0], (%[sp0]) \n\t"
"sb %[q0_f0], (%[sq0]) \n\t"
"sb %[q1_f0], (%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
- [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p6_r], %[p6_r], 16 \n\t"
"srl %[p5_r], %[p5_r], 16 \n\t"
"srl %[p4_r], %[p4_r], 16 \n\t"
@@ -510,15 +460,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q5_r], %[q5_r], 16 \n\t"
"srl %[q6_r], %[q6_r], 16 \n\t"
- : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
- [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
- [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
- [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
- [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
- :
- );
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
+ [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
+ [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
+ :);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
"srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
"srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
@@ -530,16 +479,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
- [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
- [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & flat2 & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_r], +1(%[sp6]) \n\t"
"sb %[p5_r], +1(%[sp5]) \n\t"
"sb %[p4_r], +1(%[sp4]) \n\t"
@@ -549,14 +497,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_r], +1(%[sp0]) \n\t"
:
- : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
- [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
- [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
- [sp4] "r" (sp4), [sp3] "r" (sp3),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
- );
-
- __asm__ __volatile__ (
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
"sb %[q0_r], +1(%[sq0]) \n\t"
"sb %[q1_r], +1(%[sq1]) \n\t"
"sb %[q2_r], +1(%[sq2]) \n\t"
@@ -566,14 +512,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_r], +1(%[sq6]) \n\t"
:
- : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
- [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
- [sq2] "r" (sq2), [sq3] "r" (sq3),
- [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
- );
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
} else if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r_f1], +1(%[sp2]) \n\t"
"sb %[p1_r_f1], +1(%[sp1]) \n\t"
"sb %[p0_r_f1], +1(%[sp0]) \n\t"
@@ -582,39 +526,36 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r_f1], +1(%[sq2]) \n\t"
:
- : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
- [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
- [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
} else if (mask & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +1(%[sp1]) \n\t"
"sb %[p0_f0], +1(%[sp0]) \n\t"
"sb %[q0_f0], +1(%[sq0]) \n\t"
"sb %[q1_f0], +1(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
- [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & flat2 & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_l], +2(%[sp6]) \n\t"
"sb %[p5_l], +2(%[sp5]) \n\t"
"sb %[p4_l], +2(%[sp4]) \n\t"
@@ -624,14 +565,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_l], +2(%[sp0]) \n\t"
:
- : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
- [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
- [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
- [sp4] "r" (sp4), [sp3] "r" (sp3),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
- );
-
- __asm__ __volatile__ (
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
"sb %[q0_l], +2(%[sq0]) \n\t"
"sb %[q1_l], +2(%[sq1]) \n\t"
"sb %[q2_l], +2(%[sq2]) \n\t"
@@ -641,14 +580,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_l], +2(%[sq6]) \n\t"
:
- : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
- [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
- [sq2] "r" (sq2), [sq3] "r" (sq3),
- [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
- );
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
} else if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l_f1], +2(%[sp2]) \n\t"
"sb %[p1_l_f1], +2(%[sp1]) \n\t"
"sb %[p0_l_f1], +2(%[sp0]) \n\t"
@@ -657,27 +594,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l_f1], +2(%[sq2]) \n\t"
:
- : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
- [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
- [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
} else if (mask & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +2(%[sp1]) \n\t"
"sb %[p0_f0], +2(%[sp0]) \n\t"
"sb %[q0_f0], +2(%[sq0]) \n\t"
"sb %[q1_f0], +2(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
- [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p6_l], %[p6_l], 16 \n\t"
"srl %[p5_l], %[p5_l], 16 \n\t"
"srl %[p4_l], %[p4_l], 16 \n\t"
@@ -693,15 +628,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q5_l], %[q5_l], 16 \n\t"
"srl %[q6_l], %[q6_l], 16 \n\t"
- : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
- [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
- [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
- [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
- [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
- :
- );
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
"srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
"srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
@@ -713,16 +647,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
- [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
- [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & flat2 & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_l], +3(%[sp6]) \n\t"
"sb %[p5_l], +3(%[sp5]) \n\t"
"sb %[p4_l], +3(%[sp4]) \n\t"
@@ -732,14 +665,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_l], +3(%[sp0]) \n\t"
:
- : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
- [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
- [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
- [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
- [sp1] "r" (sp1), [sp0] "r" (sp0)
- );
-
- __asm__ __volatile__ (
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
"sb %[q0_l], +3(%[sq0]) \n\t"
"sb %[q1_l], +3(%[sq1]) \n\t"
"sb %[q2_l], +3(%[sq2]) \n\t"
@@ -749,15 +680,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_l], +3(%[sq6]) \n\t"
:
- : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
- [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
- [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
- [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
- [q6_l] "r" (q6_l), [sq6] "r" (sq6)
- );
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
} else if (mask & flat & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l_f1], +3(%[sp2]) \n\t"
"sb %[p1_l_f1], +3(%[sp1]) \n\t"
"sb %[p0_l_f1], +3(%[sp0]) \n\t"
@@ -766,25 +694,22 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l_f1], +3(%[sq2]) \n\t"
:
- : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
- [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
- [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
- [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
- );
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
} else if (mask & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], +3(%[sp1]) \n\t"
"sb %[p0_f0], +3(%[sp0]) \n\t"
"sb %[q0_f0], +3(%[sq0]) \n\t"
"sb %[q1_f0], +3(%[sq1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [sp1] "r" (sp1), [sp0] "r" (sp0),
- [sq0] "r" (sq0), [sq1] "r" (sq1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
}
}
diff --git a/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
index e580f014e..96e8d8858 100644
--- a/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -19,40 +19,36 @@
#include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2
-void vpx_lpf_vertical_16_dspr2(uint8_t *s,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev, flat, flat2;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
- uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
- uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
- uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
- uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
- uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat, flat2;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
- : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
- [limit_vec] "=r" (limit_vec)
- : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
- );
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
prefetch_store(s + pitch);
@@ -61,9 +57,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
s2 = s + pitch;
s3 = s2 + pitch;
s4 = s3 + pitch;
- s = s4 + pitch;
+ s = s4 + pitch;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[p0], -4(%[s1]) \n\t"
"lw %[p1], -4(%[s2]) \n\t"
"lw %[p2], -4(%[s3]) \n\t"
@@ -73,13 +69,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"lw %[p6], -8(%[s3]) \n\t"
"lw %[p7], -8(%[s4]) \n\t"
- : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1),
- [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6),
- [p5] "=&r" (p5), [p4] "=&r" (p4)
- : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
- );
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"lw %[q3], (%[s1]) \n\t"
"lw %[q2], (%[s2]) \n\t"
"lw %[q1], (%[s3]) \n\t"
@@ -89,11 +83,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"lw %[q5], +4(%[s3]) \n\t"
"lw %[q4], +4(%[s4]) \n\t"
- : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1),
- [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6),
- [q5] "=&r" (q5), [q4] "=&r" (q4)
- : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
- );
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
/* transpose p3, p2, p1, p0
original (when loaded from memory)
@@ -110,7 +102,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
p2 p3_1 p2_1 p1_1 p0_1
p3 p3_0 p2_0 p1_0 p0_0
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
@@ -126,12 +118,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[p1], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
/* transpose q0, q1, q2, q3
original (when loaded from memory)
@@ -148,7 +138,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
q1 q0_1 q1_1 q2_1 q3_1
q0 q0_0 q1_0 q2_0 q3_0
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
"precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
"precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
@@ -164,12 +154,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[q2], %[sec3], 16 \n\t"
"append %[q0], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
/* transpose p7, p6, p5, p4
original (when loaded from memory)
@@ -186,7 +174,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
p6 p7_1 p6_1 p5_1 p4_1
p7 p7_0 p6_0 p5_0 p4_0
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
"precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
"precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
@@ -202,12 +190,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[p5], %[sec3], 16 \n\t"
"append %[p7], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
+ [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
/* transpose q4, q5, q6, q7
original (when loaded from memory)
@@ -224,7 +210,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
q5 q4_1 q5_1 q26_1 q7_1
q4 q4_0 q5_0 q26_0 q7_0
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
"precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
"precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
@@ -240,71 +226,60 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[q6], %[sec3], 16 \n\t"
"append %[q4], %[sec4], 16 \n\t"
- : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
- [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
- [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4),
- [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
- :
- );
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
+ [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
- p1, p0, p3, p2, q0, q1, q2, q3,
- &hev, &mask, &flat);
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
/* f0 */
if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
((flat2 != 0) && (flat == 0) && (mask != 0))) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
STORE_F0()
} else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
(mask == 0xFFFFFFFF)) {
/* f2 */
PACK_LEFT_0TO3()
PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
- &p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l,
- &q4_l, &q5_l, &q6_l, &q7_l);
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
PACK_RIGHT_0TO3()
PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
- &p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r,
- &q4_r, &q5_r, &q6_r, &q7_r);
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
STORE_F2()
} else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
/* f1 */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
STORE_F1()
} else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
/* f0 + f1 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */
PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l);
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */
PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r);
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], -3(%[s4]) \n\t"
"sb %[p1_r], -2(%[s4]) \n\t"
"sb %[p0_r], -1(%[s4]) \n\t"
@@ -313,25 +288,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r], +2(%[s4]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [s4] "r" (s4)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
} else if (mask & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s4]) \n\t"
"sb %[p0_f0], -1(%[s4]) \n\t"
"sb %[q0_f0], (%[s4]) \n\t"
"sb %[q1_f0], +1(%[s4]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s4] "r" (s4)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t"
@@ -343,15 +315,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
- [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r], -3(%[s3]) \n\t"
"sb %[p1_r], -2(%[s3]) \n\t"
"sb %[p0_r], -1(%[s3]) \n\t"
@@ -360,64 +331,57 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r], +2(%[s3]) \n\t"
:
- : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
- [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
- [s3] "r" (s3)
- );
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
} else if (mask & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s3]) \n\t"
"sb %[p0_f0], -1(%[s3]) \n\t"
"sb %[q0_f0], (%[s3]) \n\t"
"sb %[q1_f0], +1(%[s3]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s3] "r" (s3)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__ (
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
-
- :
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [s2] "r" (s2)
- );
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
} else if (mask & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s2]) \n\t"
"sb %[p0_f0], -1(%[s2]) \n\t"
"sb %[q0_f0], (%[s2]) \n\t"
"sb %[q1_f0], +1(%[s2]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s2] "r" (s2)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t"
@@ -429,15 +393,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
- [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l], -3(%[s1]) \n\t"
"sb %[p1_l], -2(%[s1]) \n\t"
"sb %[p0_l], -1(%[s1]) \n\t"
@@ -446,54 +409,44 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l], +2(%[s1]) \n\t"
:
- : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
- [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [s1] "r" (s1)
- );
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
} else if (mask & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s1]) \n\t"
"sb %[p0_f0], -1(%[s1]) \n\t"
"sb %[q0_f0], (%[s1]) \n\t"
"sb %[q1_f0], +1(%[s1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s1] "r" (s1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
}
} else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
/* f0+f1+f2 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1,
- &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
PACK_LEFT_0TO3()
- mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
- q0_l, q1_l, q2_l, q3_l,
- &p2_l_f1, &p1_l_f1, &p0_l_f1,
- &q0_l_f1, &q1_l_f1, &q2_l_f1);
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
PACK_RIGHT_0TO3()
- mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
- q0_r, q1_r, q2_r, q3_r,
- &p2_r_f1, &p1_r_f1, &p0_r_f1,
- &q0_r_f1, &q1_r_f1, &q2_r_f1);
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
- &p3_l, &p2_l, &p1_l, &p0_l,
- &q0_l, &q1_l, &q2_l, &q3_l,
- &q4_l, &q5_l, &q6_l, &q7_l);
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
- &p3_r, &p2_r, &p1_r, &p0_r,
- &q0_r, &q1_r, &q2_r, &q3_r,
- &q4_r, &q5_r, &q6_r, &q7_r);
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
if (mask & flat & flat2 & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_r], -7(%[s4]) \n\t"
"sb %[p5_r], -6(%[s4]) \n\t"
"sb %[p4_r], -5(%[s4]) \n\t"
@@ -503,13 +456,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_r], -1(%[s4]) \n\t"
:
- : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),
- [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),
- [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
- [p0_r] "r" (p0_r), [s4] "r" (s4)
- );
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s4] "r"(s4));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[q0_r], (%[s4]) \n\t"
"sb %[q1_r], +1(%[s4]) \n\t"
"sb %[q2_r], +2(%[s4]) \n\t"
@@ -519,13 +470,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_r], +6(%[s4]) \n\t"
:
- : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
- [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
- [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
- [q6_r] "r" (q6_r), [s4] "r" (s4)
- );
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s4] "r"(s4));
} else if (mask & flat & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r_f1], -3(%[s4]) \n\t"
"sb %[p1_r_f1], -2(%[s4]) \n\t"
"sb %[p0_r_f1], -1(%[s4]) \n\t"
@@ -534,26 +483,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r_f1], +2(%[s4]) \n\t"
:
- : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
- [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
- [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
- [s4] "r" (s4)
- );
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
} else if (mask & 0x000000FF) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s4]) \n\t"
"sb %[p0_f0], -1(%[s4]) \n\t"
"sb %[q0_f0], (%[s4]) \n\t"
"sb %[q1_f0], +1(%[s4]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s4] "r" (s4)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p6_r], %[p6_r], 16 \n\t"
"srl %[p5_r], %[p5_r], 16 \n\t"
"srl %[p4_r], %[p4_r], 16 \n\t"
@@ -569,17 +514,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q5_r], %[q5_r], 16 \n\t"
"srl %[q6_r], %[q6_r], 16 \n\t"
- : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r),
- [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r),
- [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
- [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r),
- [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
- [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r),
- [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r)
- :
- );
-
- __asm__ __volatile__ (
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
+ [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
+ [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
"srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
"srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
"srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
@@ -591,16 +533,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
- [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
- [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & flat2 & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_r], -7(%[s3]) \n\t"
"sb %[p5_r], -6(%[s3]) \n\t"
"sb %[p4_r], -5(%[s3]) \n\t"
@@ -610,12 +551,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_r], -1(%[s3]) \n\t"
:
- : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
- [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
- [p0_r] "r" (p0_r), [s3] "r" (s3)
- );
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s3] "r"(s3));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[q0_r], (%[s3]) \n\t"
"sb %[q1_r], +1(%[s3]) \n\t"
"sb %[q2_r], +2(%[s3]) \n\t"
@@ -625,13 +565,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_r], +6(%[s3]) \n\t"
:
- : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
- [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
- [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
- [q6_r] "r" (q6_r), [s3] "r" (s3)
- );
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s3] "r"(s3));
} else if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_r_f1], -3(%[s3]) \n\t"
"sb %[p1_r_f1], -2(%[s3]) \n\t"
"sb %[p0_r_f1], -1(%[s3]) \n\t"
@@ -640,38 +578,33 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r_f1], +2(%[s3]) \n\t"
:
- : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
- [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
- [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
- [s3] "r" (s3)
- );
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
} else if (mask & 0x0000FF00) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s3]) \n\t"
"sb %[p0_f0], -1(%[s3]) \n\t"
"sb %[q0_f0], (%[s3]) \n\t"
"sb %[q1_f0], +1(%[s3]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s3] "r" (s3)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & flat2 & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_l], -7(%[s2]) \n\t"
"sb %[p5_l], -6(%[s2]) \n\t"
"sb %[p4_l], -5(%[s2]) \n\t"
@@ -681,12 +614,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_l], -1(%[s2]) \n\t"
:
- : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
- [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
- [p0_l] "r" (p0_l), [s2] "r" (s2)
- );
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s2] "r"(s2));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[q0_l], (%[s2]) \n\t"
"sb %[q1_l], +1(%[s2]) \n\t"
"sb %[q2_l], +2(%[s2]) \n\t"
@@ -696,12 +628,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_l], +6(%[s2]) \n\t"
:
- : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
- [q6_l] "r" (q6_l), [s2] "r" (s2)
- );
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s2] "r"(s2));
} else if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l_f1], -3(%[s2]) \n\t"
"sb %[p1_l_f1], -2(%[s2]) \n\t"
"sb %[p0_l_f1], -1(%[s2]) \n\t"
@@ -710,26 +641,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l_f1], +2(%[s2]) \n\t"
:
- : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
- [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
- [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
- [s2] "r" (s2)
- );
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
} else if (mask & 0x00FF0000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s2]) \n\t"
"sb %[p0_f0], -1(%[s2]) \n\t"
"sb %[q0_f0], (%[s2]) \n\t"
"sb %[q1_f0], +1(%[s2]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s2] "r" (s2)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
}
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p6_l], %[p6_l], 16 \n\t"
"srl %[p5_l], %[p5_l], 16 \n\t"
"srl %[p4_l], %[p4_l], 16 \n\t"
@@ -745,15 +672,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q5_l], %[q5_l], 16 \n\t"
"srl %[q6_l], %[q6_l], 16 \n\t"
- : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
- [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
- [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
- [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
- [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
- :
- );
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
"srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
"srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
@@ -765,16 +691,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t"
- : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
- [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
- [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
- [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
- [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
- :
- );
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
if (mask & flat & flat2 & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p6_l], -7(%[s1]) \n\t"
"sb %[p5_l], -6(%[s1]) \n\t"
"sb %[p4_l], -5(%[s1]) \n\t"
@@ -784,13 +709,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_l], -1(%[s1]) \n\t"
:
- : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
- [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
- [p0_l] "r" (p0_l),
- [s1] "r" (s1)
- );
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s1] "r"(s1));
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[q0_l], (%[s1]) \n\t"
"sb %[q1_l], 1(%[s1]) \n\t"
"sb %[q2_l], 2(%[s1]) \n\t"
@@ -800,13 +723,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_l], 6(%[s1]) \n\t"
:
- : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
- [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
- [q6_l] "r" (q6_l),
- [s1] "r" (s1)
- );
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s1] "r"(s1));
} else if (mask & flat & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p2_l_f1], -3(%[s1]) \n\t"
"sb %[p1_l_f1], -2(%[s1]) \n\t"
"sb %[p0_l_f1], -1(%[s1]) \n\t"
@@ -815,23 +736,19 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l_f1], +2(%[s1]) \n\t"
:
- : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
- [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
- [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
- [s1] "r" (s1)
- );
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
} else if (mask & 0xFF000000) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"sb %[p1_f0], -2(%[s1]) \n\t"
"sb %[p0_f0], -1(%[s1]) \n\t"
"sb %[q0_f0], (%[s1]) \n\t"
"sb %[q1_f0], +1(%[s1]) \n\t"
:
- : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
- [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
- [s1] "r" (s1)
- );
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
}
}
}
diff --git a/vpx_dsp/mips/loopfilter_msa.h b/vpx_dsp/mips/loopfilter_msa.h
index 62b170610..4063e5e6b 100644
--- a/vpx_dsp/mips/loopfilter_msa.h
+++ b/vpx_dsp/mips/loopfilter_msa.h
@@ -13,234 +13,238 @@
#include "vpx_dsp/mips/macros_msa.h"
-#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- filt = filt & (v16i8)hev_in; \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- /* combine left and right part */ \
- filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
- \
- filt = filt & (v16i8)mask_in; \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
-}
+#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
+ p1_out, p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
+ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
+ v8i16 q0_sub_p0_r, filt_r, cnst3h; \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt = filt & (v16i8)hev_in; \
+ q0_sub_p0 = q0_m - p0_m; \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ \
+ cnst3h = __msa_ldi_h(3); \
+ q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
+ filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
+ filt_r += q0_sub_p0_r; \
+ filt_r = __msa_sat_s_h(filt_r, 7); \
+ \
+ /* combine left and right part */ \
+ filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
+ \
+ filt = filt & (v16i8)mask_in; \
+ cnst4b = __msa_ldi_b(4); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= 3; \
+ \
+ cnst3b = __msa_ldi_b(3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= 3; \
+ \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ \
+ filt = __msa_srari_b(filt1, 1); \
+ hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
-#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- \
- filt = filt & (v16i8)hev_in; \
- \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l += q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)mask_in; \
- \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
-}
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
+ p1_out, p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
+ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
+ v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q0_sub_p0 = q0_m - p0_m; \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ \
+ cnst3h = __msa_ldi_h(3); \
+ q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
+ filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
+ filt_r += q0_sub_p0_r; \
+ filt_r = __msa_sat_s_h(filt_r, 7); \
+ \
+ q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
+ filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
+ filt_l += q0_sub_p0_l; \
+ filt_l = __msa_sat_s_h(filt_l, 7); \
+ \
+ filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
+ filt = filt & (v16i8)mask_in; \
+ \
+ cnst4b = __msa_ldi_b(4); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= 3; \
+ \
+ cnst3b = __msa_ldi_b(3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= 3; \
+ \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ \
+ filt = __msa_srari_b(filt1, 1); \
+ hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
-#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \
- v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
- v16u8 zero_in = { 0 }; \
- \
- tmp = __msa_ori_b(zero_in, 1); \
- p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
- q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
- p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
- q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
- \
- p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
- flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
- p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
- flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
- \
- flat_out = (tmp < (v16u8)flat_out); \
- flat_out = __msa_xori_b(flat_out, 0xff); \
- flat_out = flat_out & (mask); \
-}
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ { \
+ v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
+ v16u8 zero_in = { 0 }; \
+ \
+ tmp = __msa_ori_b(zero_in, 1); \
+ p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
+ q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
+ p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
+ q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
+ \
+ p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
+ flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
+ p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
+ flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
+ \
+ flat_out = (tmp < (v16u8)flat_out); \
+ flat_out = __msa_xori_b(flat_out, 0xff); \
+ flat_out = flat_out & (mask); \
+ }
-#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
- q5_in, q6_in, q7_in, flat_in, flat2_out) { \
- v16u8 tmp, zero_in = { 0 }; \
- v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
- v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
- \
- tmp = __msa_ori_b(zero_in, 1); \
- p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
- q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
- p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
- q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
- p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
- q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
- p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
- q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
- \
- p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
- flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
- flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
- p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
- flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
- p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
- flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
- \
- flat2_out = (tmp < (v16u8)flat2_out); \
- flat2_out = __msa_xori_b(flat2_out, 0xff); \
- flat2_out = flat2_out & flat_in; \
-}
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
+ q6_in, q7_in, flat_in, flat2_out) \
+ { \
+ v16u8 tmp, zero_in = { 0 }; \
+ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
+ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
+ \
+ tmp = __msa_ori_b(zero_in, 1); \
+ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
+ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
+ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
+ q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
+ p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
+ q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
+ p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
+ q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
+ \
+ p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
+ p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
+ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
+ \
+ flat2_out = (tmp < (v16u8)flat2_out); \
+ flat2_out = __msa_xori_b(flat2_out, 0xff); \
+ flat2_out = flat2_out & flat_in; \
+ }
-#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
- q0_in, q1_in, q2_in, q3_in, \
- p2_filt8_out, p1_filt8_out, p0_filt8_out, \
- q0_filt8_out, q1_filt8_out, q2_filt8_out) { \
- v8u16 tmp0, tmp1, tmp2; \
- \
- tmp2 = p2_in + p1_in + p0_in; \
- tmp0 = p3_in << 1; \
- \
- tmp0 = tmp0 + tmp2 + q0_in; \
- tmp1 = tmp0 + p3_in + p2_in; \
- p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
- \
- tmp1 = tmp0 + p1_in + q1_in; \
- p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
- \
- tmp1 = q2_in + q1_in + q0_in; \
- tmp2 = tmp2 + tmp1; \
- tmp0 = tmp2 + (p0_in); \
- tmp0 = tmp0 + (p3_in); \
- p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \
- \
- tmp0 = q2_in + q3_in; \
- tmp0 = p0_in + tmp1 + tmp0; \
- tmp1 = q3_in + q3_in; \
- tmp1 = tmp1 + tmp0; \
- q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
- \
- tmp0 = tmp2 + q3_in; \
- tmp1 = tmp0 + q0_in; \
- q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
- \
- tmp1 = tmp0 - p2_in; \
- tmp0 = q1_in + q3_in; \
- tmp1 = tmp0 + tmp1; \
- q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
-}
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+ q1_filt8_out, q2_filt8_out) \
+ { \
+ v8u16 tmp0, tmp1, tmp2; \
+ \
+ tmp2 = p2_in + p1_in + p0_in; \
+ tmp0 = p3_in << 1; \
+ \
+ tmp0 = tmp0 + tmp2 + q0_in; \
+ tmp1 = tmp0 + p3_in + p2_in; \
+ p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ \
+ tmp1 = tmp0 + p1_in + q1_in; \
+ p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ \
+ tmp1 = q2_in + q1_in + q0_in; \
+ tmp2 = tmp2 + tmp1; \
+ tmp0 = tmp2 + (p0_in); \
+ tmp0 = tmp0 + (p3_in); \
+ p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \
+ \
+ tmp0 = q2_in + q3_in; \
+ tmp0 = p0_in + tmp1 + tmp0; \
+ tmp1 = q3_in + q3_in; \
+ tmp1 = tmp1 + tmp0; \
+ q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ \
+ tmp0 = tmp2 + q3_in; \
+ tmp1 = tmp0 + q0_in; \
+ q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ \
+ tmp1 = tmp0 - p2_in; \
+ tmp0 = q1_in + q3_in; \
+ tmp1 = tmp0 + tmp1; \
+ q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ }
-#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
- q0_in, q1_in, q2_in, q3_in, \
- limit_in, b_limit_in, thresh_in, \
- hev_out, mask_out, flat_out) { \
- v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
- v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
- \
- /* absolute subtraction of pixel values */ \
- p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
- p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
- p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
- q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
- q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
- q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
- p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
- p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
- \
- /* calculation of hev */ \
- flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
- hev_out = thresh_in < (v16u8)flat_out; \
- \
- /* calculation of mask */ \
- p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
- p1_asub_q1_m >>= 1; \
- p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
- \
- mask_out = b_limit_in < p0_asub_q0_m; \
- mask_out = __msa_max_u_b(flat_out, mask_out); \
- p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
- mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
- q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
- mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
- \
- mask_out = limit_in < (v16u8)mask_out; \
- mask_out = __msa_xori_b(mask_out, 0xff); \
-}
-#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = thresh_in < (v16u8)flat_out; \
+ \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m >>= 1; \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ \
+ mask_out = b_limit_in < p0_asub_q0_m; \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ \
+ mask_out = limit_in < (v16u8)mask_out; \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+ }
+#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index ea59eafe9..f498fbe9d 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -38,194 +38,186 @@
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
#if (__mips_isa_rev >= 6)
-#define LH(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__ ( \
- "lh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#define LW(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__ ( \
- "lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
#if (__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__ ( \
- "ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
#else // !(__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_m); \
+ val1_m = LW(psrc_m + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+#else // !(__mips_isa_rev >= 6)
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
\
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
+ __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \
\
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
\
- val_m; \
-})
-#endif // (__mips == 64)
+ val_m; \
+ })
-#define SH(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SD(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-#else // !(__mips_isa_rev >= 6)
-#define LH(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__ ( \
- "ulh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#define LW(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__ ( \
- "ulw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
#if (__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__ ( \
- "uld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-#else // !(__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
\
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
+ __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \
\
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
\
- val_m; \
-})
-#endif // (__mips == 64)
-
-#define SH(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SD(val, pdst) { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
\
- val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ val0_m = LW(psrc_m1); \
+ val1_m = LW(psrc_m1 + 4); \
\
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
-}
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_m1 = (uint8_t *)(pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_m1); \
+ SW(val1_m, pdst_m1 + 4); \
+ }
#endif // (__mips_isa_rev >= 6)
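Taken together, LH/LW/LD and SH/SW/SD are scalar load/store wrappers that pick aligned or unaligned MIPS instructions per ISA revision; on 32-bit pre-R6 builds LD/SD fall back to two word accesses, as the hunks above show. A minimal, hedged usage sketch (the copy8 helper name and the include below are assumptions for illustration, not part of this patch):

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Copy 8 possibly-unaligned bytes through the LD/SD helpers above. */
static void copy8(const uint8_t *src, uint8_t *dst) {
  const uint64_t v = LD(src); /* two LW()s on 32-bit targets */
  SD(v, dst);                 /* likewise split into two SW()s there */
}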
/* Description : Load 4 words with stride
@@ -236,12 +228,13 @@
Load word in 'out2' from (psrc + 2 * stride)
Load word in 'out3' from (psrc + 3 * stride)
*/
-#define LW4(psrc, stride, out0, out1, out2, out3) { \
- out0 = LW((psrc)); \
- out1 = LW((psrc) + stride); \
- out2 = LW((psrc) + 2 * stride); \
- out3 = LW((psrc) + 3 * stride); \
-}
+#define LW4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ out0 = LW((psrc)); \
+ out1 = LW((psrc) + stride); \
+ out2 = LW((psrc) + 2 * stride); \
+ out3 = LW((psrc) + 3 * stride); \
+ }
/* Description : Load double words with stride
Arguments : Inputs - psrc, stride
@@ -249,14 +242,16 @@
Details : Load double word in 'out0' from (psrc)
Load double word in 'out1' from (psrc + stride)
*/
-#define LD2(psrc, stride, out0, out1) { \
- out0 = LD((psrc)); \
- out1 = LD((psrc) + stride); \
-}
-#define LD4(psrc, stride, out0, out1, out2, out3) { \
- LD2((psrc), stride, out0, out1); \
- LD2((psrc) + 2 * stride, stride, out2, out3); \
-}
+#define LD2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD((psrc)); \
+ out1 = LD((psrc) + stride); \
+ }
+#define LD4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD2((psrc), stride, out0, out1); \
+ LD2((psrc) + 2 * stride, stride, out2, out3); \
+ }
/* Description : Store 4 words with stride
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
@@ -265,12 +260,13 @@
Store word from 'in2' to (pdst + 2 * stride)
Store word from 'in3' to (pdst + 3 * stride)
*/
-#define SW4(in0, in1, in2, in3, pdst, stride) { \
- SW(in0, (pdst)) \
- SW(in1, (pdst) + stride); \
- SW(in2, (pdst) + 2 * stride); \
- SW(in3, (pdst) + 3 * stride); \
-}
+#define SW4(in0, in1, in2, in3, pdst, stride) \
+ { \
+ SW(in0, (pdst)) \
+ SW(in1, (pdst) + stride); \
+ SW(in2, (pdst) + 2 * stride); \
+ SW(in3, (pdst) + 3 * stride); \
+ }
/* Description : Store 4 double words with stride
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
@@ -279,12 +275,13 @@
Store double word from 'in2' to (pdst + 2 * stride)
Store double word from 'in3' to (pdst + 3 * stride)
*/
-#define SD4(in0, in1, in2, in3, pdst, stride) { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
-}
+#define SD4(in0, in1, in2, in3, pdst, stride) \
+ { \
+ SD(in0, (pdst)) \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+ }
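LW4/LD2/LD4 and SW4/SD4 simply unroll those scalar helpers across strided rows. A short sketch of the intended pattern, with illustrative helper names and the header path assumed:

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Copy a 4-wide x 4-row block, one word per row, with the strided helpers. */
static void copy_w4x4(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                      int32_t dst_stride) {
  uint32_t w0, w1, w2, w3;

  LW4(src, src_stride, w0, w1, w2, w3);
  SW4(w0, w1, w2, w3, dst, dst_stride);
}

/* Same idea for an 8-wide x 4-row block, one doubleword per row. */
static void copy_d8x4(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                      int32_t dst_stride) {
  uint64_t d0, d1, d2, d3;

  LD4(src, src_stride, d0, d1, d2, d3);
  SD4(d0, d1, d2, d3, dst, dst_stride);
}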
/* Description : Load vectors with 16 byte elements with stride
Arguments : Inputs - psrc, stride
@@ -293,45 +290,50 @@
Details : Load 16 byte elements in 'out0' from (psrc)
Load 16 byte elements in 'out1' from (psrc + stride)
*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_B(RTYPE, (psrc)); \
- out1 = LD_B(RTYPE, (psrc) + stride); \
-}
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_B(RTYPE, (psrc)); \
+ out1 = LD_B(RTYPE, (psrc) + stride); \
+ }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
-}
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
+ }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
-}
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
-}
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+ { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
+ }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-#define LD_B7(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6) { \
- LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
- LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
-}
+#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
+ { \
+ LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+ }
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-#define LD_B8(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
-}
+#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
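A hedged example of the LD_B* strided vector loads (load_8_rows is an illustrative name, not a libvpx function):

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Pull eight 16-byte rows into MSA registers, e.g. as the first step of a
   block SAD or variance kernel. */
static void load_8_rows(const uint8_t *src, int32_t stride, v16u8 out[8]) {
  LD_UB8(src, stride, out[0], out[1], out[2], out[3], out[4], out[5], out[6],
         out[7]);
}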
@@ -341,33 +343,36 @@
Details : Load 8 halfword elements in 'out0' from (psrc)
Load 8 halfword elements in 'out1' from (psrc + stride)
*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_H(RTYPE, (psrc)); \
- out1 = LD_H(RTYPE, (psrc) + (stride)); \
-}
+#define LD_H2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_H(RTYPE, (psrc)); \
+ out1 = LD_H(RTYPE, (psrc) + (stride)); \
+ }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_H2(RTYPE, (psrc), stride, out0, out1); \
- LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
-}
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_H2(RTYPE, (psrc), stride, out0, out1); \
+ LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-#define LD_H8(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
-}
+#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-#define LD_H16(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- out8, out9, out10, out11, out12, out13, out14, out15) { \
- LD_H8(RTYPE, (psrc), stride, \
- out0, out1, out2, out3, out4, out5, out6, out7); \
- LD_H8(RTYPE, (psrc) + 8 * stride, stride, \
- out8, out9, out10, out11, out12, out13, out14, out15); \
-}
+#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7, out8, out9, out10, out11, out12, out13, out14, out15) \
+ { \
+ LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7); \
+ LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
+ out13, out14, out15); \
+ }
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
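The LD_H* variants do the same for halfword data; a sketch assuming 16-bit coefficient rows (illustrative helper name):

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Load an 8x8 block of int16_t coefficients, one v8i16 register per row.
   The stride is in elements because psrc is an int16_t pointer here. */
static void load_8x8_coeffs(const int16_t *input, v8i16 r[8]) {
  LD_SH8(input, 8, r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
}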
/* Description : Load 4x4 block of signed halfword elements from 1D source
@@ -375,45 +380,49 @@
Arguments : Input - psrc
Outputs - out0, out1, out2, out3
*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
- out0 = LD_SH(psrc); \
- out2 = LD_SH(psrc + 8); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
-}
+#define LD4x4_SH(psrc, out0, out1, out2, out3) \
+ { \
+ out0 = LD_SH(psrc); \
+ out2 = LD_SH(psrc + 8); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ }
/* Description : Load 2 vectors of signed word elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
Return Type - signed word
*/
-#define LD_SW2(psrc, stride, out0, out1) { \
- out0 = LD_SW((psrc)); \
- out1 = LD_SW((psrc) + stride); \
-}
+#define LD_SW2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD_SW((psrc)); \
+ out1 = LD_SW((psrc) + stride); \
+ }
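LD4x4_SH packs a contiguous 4x4 halfword block into the low halves of four registers, and LD_SW2 is the word-vector analogue of LD_SH2. A small sketch (helper name assumed):

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Rows 0..3 of a contiguous 4x4 int16_t block end up in the low halves of
   r0..r3 respectively. */
static void load_4x4_coeffs(const int16_t *input, v8i16 *r0, v8i16 *r1,
                            v8i16 *r2, v8i16 *r3) {
  LD4x4_SH(input, *r0, *r1, *r2, *r3);
}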
/* Description : Store vectors of 16 byte elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 16 byte elements from 'in0' to (pdst)
Store 16 byte elements from 'in1' to (pdst + stride)
*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
- ST_B(RTYPE, in0, (pdst)); \
- ST_B(RTYPE, in1, (pdst) + stride); \
-}
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+ }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
- ST_B2(RTYPE, in0, in1, (pdst), stride); \
- ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-}
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- pdst, stride) { \
- ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
- ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
-}
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
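Paired with the LD_B* loads above, the ST_B* stores give a one-liner block copy; a hedged sketch with an illustrative helper name:

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Copy a 16x4 byte block: four strided vector loads, four strided stores. */
static void copy_16x4(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                      int32_t dst_stride) {
  v16u8 r0, r1, r2, r3;

  LD_UB4(src, src_stride, r0, r1, r2, r3);
  ST_UB4(r0, r1, r2, r3, dst, dst_stride);
}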
/* Description : Store vectors of 8 halfword elements with stride
@@ -421,22 +430,25 @@
Details : Store 8 halfword elements from 'in0' to (pdst)
Store 8 halfword elements from 'in1' to (pdst + stride)
*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
-}
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+ }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
- ST_H2(RTYPE, in0, in1, (pdst), stride); \
- ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-}
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_H2(RTYPE, in0, in1, (pdst), stride); \
+ ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
- ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
- ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
-}
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
+ ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
@@ -444,10 +456,11 @@
Details : Store 4 word elements from 'in0' to (pdst)
Store 4 word elements from 'in1' to (pdst + stride)
*/
-#define ST_SW2(in0, in1, pdst, stride) { \
- ST_SW(in0, (pdst)); \
- ST_SW(in1, (pdst) + stride); \
-}
+#define ST_SW2(in0, in1, pdst, stride) \
+ { \
+ ST_SW(in0, (pdst)); \
+ ST_SW(in1, (pdst) + stride); \
+ }
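The halfword and word stores mirror that pattern for 16-bit and 32-bit data; for example (illustrative helper, stride in elements):

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Write eight v8i16 rows back to a 16-bit buffer, e.g. transform output. */
static void store_8x8_coeffs(const v8i16 r[8], int16_t *output) {
  ST_SH8(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], output, 8);
}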
/* Description : Store 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride
@@ -460,20 +473,21 @@
Index 'stidx+3' halfword element from 'in' vector is copied to
the GP register and stored to (pdst + 3 * stride)
*/
-#define ST2x4_UB(in, stidx, pdst, stride) { \
- uint16_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
- out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
- out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
- out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
- \
- SH(out0_m, pblk_2x4_m); \
- SH(out1_m, pblk_2x4_m + stride); \
- SH(out2_m, pblk_2x4_m + 2 * stride); \
- SH(out3_m, pblk_2x4_m + 3 * stride); \
-}
+#define ST2x4_UB(in, stidx, pdst, stride) \
+ { \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
+ out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
+ out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
+ out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
+ \
+ SH(out0_m, pblk_2x4_m); \
+ SH(out1_m, pblk_2x4_m + stride); \
+ SH(out2_m, pblk_2x4_m + 2 * stride); \
+ SH(out3_m, pblk_2x4_m + 3 * stride); \
+ }
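ST2x4_UB is the narrow-write helper: four 2-byte stores routed through general-purpose registers, the kind of 2-pixel-wide column write a vertical filter performs. A sketch with an assumed helper name:

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Halfword lanes 0..3 of 'px' go to four consecutive rows of dst. */
static void store_2x4(v16u8 px, uint8_t *dst, int32_t stride) {
  ST2x4_UB(px, 0, dst, stride);
}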
/* Description : Store 4x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
@@ -482,16 +496,17 @@
Index 1 word element from 'in' vector is copied to the GP
register and stored to (pdst + stride)
*/
-#define ST4x2_UB(in, pdst, stride) { \
- uint32_t out0_m, out1_m; \
- uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in, 0); \
- out1_m = __msa_copy_u_w((v4i32)in, 1); \
- \
- SW(out0_m, pblk_4x2_m); \
- SW(out1_m, pblk_4x2_m + stride); \
-}
+#define ST4x2_UB(in, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+ }
/* Description : Store 4x4 byte block to destination memory from input vector
Arguments : Inputs - in0, in1, pdst, stride
@@ -504,35 +519,38 @@
'Idx3' word element from input vector 'in0' is copied to the
GP register and stored to (pdst + 3 * stride)
*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
- out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
- out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
- out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
- \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
-}
-#define ST4x8_UB(in0, in1, pdst, stride) { \
- uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
- \
- ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
- ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
-}
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
+ out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
+ out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
+ out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
+ \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+ }
+#define ST4x8_UB(in0, in1, pdst, stride) \
+ { \
+ uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+ \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+ }
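ST4x4_UB lets the caller pick which word lanes feed each row; with indices 0 and 1 from each input it stores two rows per vector. Illustrative sketch:

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Store a 4x4 byte block: word lanes 0 and 1 of in0, then 0 and 1 of in1. */
static void store_4x4(v16u8 in0, v16u8 in1, uint8_t *dst, int32_t stride) {
  ST4x4_UB(in0, in1, 0, 1, 0, 1, dst, stride);
}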
/* Description : Store 8x1 byte block to destination memory from input vector
Arguments : Inputs - in, pdst
Details : Index 0 double word element from 'in' vector is copied to the
GP register and stored to (pdst)
*/
-#define ST8x1_UB(in, pdst) { \
- uint64_t out0_m; \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- SD(out0_m, pdst); \
-}
+#define ST8x1_UB(in, pdst) \
+ { \
+ uint64_t out0_m; \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ SD(out0_m, pdst); \
+ }
/* Description : Store 8x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
@@ -541,16 +559,17 @@
Index 1 double word element from 'in' vector is copied to the
GP register and stored to (pdst + stride)
*/
-#define ST8x2_UB(in, pdst, stride) { \
- uint64_t out0_m, out1_m; \
- uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- out1_m = __msa_copy_u_d((v2i64)in, 1); \
- \
- SD(out0_m, pblk_8x2_m); \
- SD(out1_m, pblk_8x2_m + stride); \
-}
+#define ST8x2_UB(in, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+ }
/* Description : Store 8x4 byte block to destination memory from input
vectors
@@ -564,17 +583,18 @@
Index 1 double word element from 'in1' vector is copied to the
GP register and stored to (pdst + 3 * stride)
*/
-#define ST8x4_UB(in0, in1, pdst, stride) { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
-}
+#define ST8x4_UB(in0, in1, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+ }
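The ST8x* helpers do the same through 64-bit copies; a hedged sketch:

#include <stdint.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Store an 8x4 byte block, two doubleword lanes per input vector. */
static void store_8x4(v16u8 in0, v16u8 in1, uint8_t *dst, int32_t stride) {
  ST8x4_UB(in0, in1, dst, stride);
}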
/* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0, in1, in2, in3,
@@ -584,17 +604,19 @@
each unsigned byte element from 'in1' vector. Then the average
with rounding is calculated and written to 'out0'
*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
- out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
-}
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
+ out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
+ }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
-}
+#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
+ }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
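AVER_UB2/AVER_UB4 compute the rounded average that averaging prediction paths need; a sketch (the array-based helper is an assumption for brevity):

#include "vpx_dsp/mips/macros_msa.h"

/* out[i] = (a[i] + b[i] + 1) >> 1, element-wise, for four register pairs. */
static void average_4(const v16u8 a[4], const v16u8 b[4], v16u8 out[4]) {
  AVER_UB4_UB(a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3], out[0], out[1],
              out[2], out[3]);
}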
/* Description : Immediate number of elements to slide with zero
@@ -604,18 +626,20 @@
Details : Byte elements from 'zero_m' vector are slid into 'in0' by
value specified in the 'slide_val'
*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
- v16i8 zero_m = { 0 }; \
- out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
-}
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
+ }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
- out0, out1, out2, out3, slide_val) { \
- SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
- SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
-}
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
+ slide_val) \
+ { \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
+ }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
/* Description : Immediate number of elements to slide
@@ -625,18 +649,20 @@
Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
value specified in the 'slide_val'
*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
- out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
-}
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ { \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
+ }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
- out0, out1, out2, slide_val) { \
- SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
-}
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
+ out2, slide_val) \
+ { \
+ SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
+ }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
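A hedged example of the zero-fill slide, following the macro's own description (the slide amount must be an immediate):

#include "vpx_dsp/mips/macros_msa.h"

/* Slide zero bytes into 'a' and 'b' by two element positions. */
static void slide_in_zeros(v16u8 a, v16u8 b, v16u8 *out0, v16u8 *out1) {
  SLDI_B2_0(v16u8, a, b, *out0, *out1, 2);
}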
@@ -647,19 +673,21 @@
Details : Byte elements from 'in0' & 'in1' are copied selectively to
'out0' as per control vector 'mask0'
*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
- out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
-}
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+ }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
- out0, out1, out2, out3) { \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
-}
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
+ out3) \
+ { \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
+ }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
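VSHF_B2 gathers bytes from a vector pair under two control masks, the kind of sliding-window selection the MSA convolve kernels build on. A sketch with assumed names:

#include "vpx_dsp/mips/macros_msa.h"

/* out0/out1 select bytes from the {a, b} pair per mask0/mask1. */
static void shuffle_pairs(v16i8 a, v16i8 b, v16i8 mask0, v16i8 mask1,
                          v16i8 *out0, v16i8 *out1) {
  VSHF_B2_SB(a, b, a, b, mask0, mask1, *out0, *out1);
}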
@@ -673,18 +701,19 @@
The multiplication result of adjacent odd-even elements
are added together and written to the 'out0' vector
*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
- out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
-}
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
+ }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, \
- out0, out1, out2, out3) { \
- DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
/* Description : Dot product of byte vector elements
@@ -697,17 +726,19 @@
The multiplication result of adjacent odd-even elements
are added together and written to the 'out0' vector
*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
-}
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+ }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
- DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product of halfword vector elements
@@ -720,18 +751,19 @@
The multiplication result of adjacent odd-even elements
are added together and written to the 'out0' vector
*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
-}
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+ }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, \
- out0, out1, out2, out3) { \
- DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product of word vector elements
@@ -744,10 +776,11 @@
The multiplication result of adjacent odd-even elements
are added together and written to the 'out0' vector
*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
-}
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+ }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
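The DOTP_* family multiplies adjacent odd/even lanes and sums each pair into a wider lane, the core multiply-accumulate step of a SIMD FIR filter. A hedged sketch:

#include "vpx_dsp/mips/macros_msa.h"

/* Overwrite acc0/acc1 with the pairwise pix * coeff sums, widened to
   halfword lanes. */
static void dot_products(v16i8 pix0, v16i8 pix1, v16i8 coeff0, v16i8 coeff1,
                         v8i16 *acc0, v8i16 *acc1) {
  DOTP_SB2_SH(pix0, pix1, coeff0, coeff1, *acc0, *acc1);
}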
/* Description : Dot product & addition of byte vector elements
@@ -760,17 +793,19 @@
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
-}
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
+ }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
- DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product & addition of halfword vector elements
@@ -783,10 +818,11 @@
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
-}
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+ }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of double word vector elements
@@ -799,10 +835,11 @@
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
- out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
-}
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
+ out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
+ }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
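DPADD_* is the accumulating form: it adds the new dot products onto existing sums, which is how longer filters chain their coefficient pairs. Sketch with assumed names:

#include "vpx_dsp/mips/macros_msa.h"

/* acc += pairwise pix * coeff sums (halfword accumulators). */
static void accumulate_taps(v16i8 pix0, v16i8 pix1, v16i8 coeff0,
                            v16i8 coeff1, v8i16 *acc0, v8i16 *acc1) {
  DPADD_SB2_SH(pix0, pix1, coeff0, coeff1, *acc0, *acc1);
}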
/* Description : Minimum values between unsigned elements of
@@ -813,16 +850,18 @@
Details : Minimum of unsigned halfword element values from 'in0' and
'min_vec' are written to output vector 'in0'
*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec) { \
- in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
- in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
-}
+#define MIN_UH2(RTYPE, in0, in1, min_vec) \
+ { \
+ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+ in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+ }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
- MIN_UH2(RTYPE, in0, in1, min_vec); \
- MIN_UH2(RTYPE, in2, in3, min_vec); \
-}
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
+ { \
+ MIN_UH2(RTYPE, in0, in1, min_vec); \
+ MIN_UH2(RTYPE, in2, in3, min_vec); \
+ }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
/* Description : Clips all signed halfword elements of input vector
@@ -831,22 +870,25 @@
Output - out_m
Return Type - signed halfword
*/
-#define CLIP_SH_0_255(in) ({ \
- v8i16 max_m = __msa_ldi_h(255); \
- v8i16 out_m; \
- \
- out_m = __msa_maxi_s_h((v8i16)in, 0); \
- out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
- out_m; \
-})
-#define CLIP_SH2_0_255(in0, in1) { \
- in0 = CLIP_SH_0_255(in0); \
- in1 = CLIP_SH_0_255(in1); \
-}
-#define CLIP_SH4_0_255(in0, in1, in2, in3) { \
- CLIP_SH2_0_255(in0, in1); \
- CLIP_SH2_0_255(in2, in3); \
-}
+#define CLIP_SH_0_255(in) \
+ ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)in, 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+ })
+#define CLIP_SH2_0_255(in0, in1) \
+ { \
+ in0 = CLIP_SH_0_255(in0); \
+ in1 = CLIP_SH_0_255(in1); \
+ }
+#define CLIP_SH4_0_255(in0, in1, in2, in3) \
+ { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+ }
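A small sketch (not from this change) of the usual reconstruction clamp these CLIP macros exist for, assuming the same header; names are illustrative:

  #include "vpx_dsp/mips/macros_msa.h"

  /* Add a halfword residual to a halfword prediction and clip every lane
     to the valid 8-bit pixel range before packing back down to bytes. */
  static v8i16 clip_recon(v8i16 residual, v8i16 pred) {
    v8i16 sum = residual + pred;  /* lane-wise halfword addition */
    return CLIP_SH_0_255(sum);    /* each lane clamped to [0, 255] */
  }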
/* Description : Horizontal addition of 4 signed word elements of input vector
Arguments : Input - in (signed word vector)
@@ -855,16 +897,17 @@
Details : 4 signed word elements of 'in' vector are added together and
the resulting integer sum is returned
*/
-#define HADD_SW_S32(in) ({ \
- v2i64 res0_m, res1_m; \
- int32_t sum_m; \
- \
- res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
- res1_m = __msa_splati_d(res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
- sum_m; \
-})
+#define HADD_SW_S32(in) \
+ ({ \
+ v2i64 res0_m, res1_m; \
+ int32_t sum_m; \
+ \
+ res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
+ res1_m = __msa_splati_d(res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
/* Description : Horizontal addition of 8 unsigned halfword elements
Arguments : Inputs - in (unsigned halfword vector)
@@ -873,18 +916,19 @@
Details : 8 unsigned halfword elements of input vector are added
together and the resulting integer sum is returned
*/
-#define HADD_UH_U32(in) ({ \
- v4u32 res_m; \
- v2u64 res0_m, res1_m; \
- uint32_t sum_m; \
- \
- res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
- res0_m = __msa_hadd_u_d(res_m, res_m); \
- res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
- sum_m; \
-})
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 res_m; \
+ v2u64 res0_m, res1_m; \
+ uint32_t sum_m; \
+ \
+ res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ res0_m = __msa_hadd_u_d(res_m, res_m); \
+ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
/* Description : Horizontal addition of unsigned byte vector elements
Arguments : Inputs - in0, in1
@@ -894,16 +938,18 @@
even unsigned byte element from 'in0' (pairwise) and the
halfword result is written to 'out0'
*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
-}
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+ }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \
- HADD_UB2(RTYPE, in0, in1, out0, out1); \
- HADD_UB2(RTYPE, in2, in3, out2, out3); \
-}
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ HADD_UB2(RTYPE, in0, in1, out0, out1); \
+ HADD_UB2(RTYPE, in2, in3, out2, out3); \
+ }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
@@ -914,10 +960,11 @@
even unsigned byte element from 'in0' (pairwise) and the
halfword result is written to 'out0'
*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
-}
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+ }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Description : SAD (Sum of Absolute Difference)
@@ -928,18 +975,19 @@
'ref0' is calculated and preserved in 'diff0'. Then even-odd
pairs are added together to generate 8 halfword results.
*/
-#define SAD_UB2_UH(in0, in1, ref0, ref1) ({ \
- v16u8 diff0_m, diff1_m; \
- v8u16 sad_m = { 0 }; \
- \
- diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
- diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
- \
- sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
- sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
- \
- sad_m; \
-})
+#define SAD_UB2_UH(in0, in1, ref0, ref1) \
+ ({ \
+ v16u8 diff0_m, diff1_m; \
+ v8u16 sad_m = { 0 }; \
+ \
+ diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
+ diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
+ \
+ sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
+ sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
+ \
+ sad_m; \
+ })
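Illustrative only: how this macro pairs with HADD_UH_U32, mirroring the accumulation done by the sad_*_msa() kernels later in this patch:

  #include <stdint.h>
  #include "vpx_dsp/mips/macros_msa.h"

  /* SAD of two 16-byte rows against their references: byte absolute
     differences are pair-summed into 8 halfwords, then reduced to a scalar. */
  static uint32_t sad_two_rows(v16u8 s0, v16u8 s1, v16u8 r0, v16u8 r1) {
    v8u16 sad = SAD_UB2_UH(s0, s1, r0, r1);
    return HADD_UH_U32(sad);
  }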
/* Description : Horizontal subtraction of signed halfword vector elements
Arguments : Inputs - in0, in1
@@ -949,10 +997,11 @@
even signed halfword element from 'in0' (pairwise) and the
word result is written to 'out0'
*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
- out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
-}
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+ }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
/* Description : Set element n of input vector to GPR value
@@ -961,25 +1010,28 @@
Return Type - as per RTYPE
Details : Set element 0 in vector 'out' to value specified in 'in0'
*/
-#define INSERT_W2(RTYPE, in0, in1, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
-}
+#define INSERT_W2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
-}
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
+ }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-#define INSERT_D2(RTYPE, in0, in1, out) { \
- out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
- out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
-}
+#define INSERT_D2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+ }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
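Sketch of the 4-wide load pattern the INSERT macros support (LW4 lives earlier in this header; the helper name is illustrative and not part of the patch):

  #include <stdint.h>
  #include "vpx_dsp/mips/macros_msa.h"

  /* Gather four 32-bit rows of a 4-pixel-wide block into a single vector. */
  static v16u8 load_4x4_block(const uint8_t *src, int32_t stride) {
    uint32_t row0, row1, row2, row3;
    v16u8 out = { 0 };

    LW4(src, stride, row0, row1, row2, row3);   /* four scalar word loads */
    INSERT_W4_UB(row0, row1, row2, row3, out);  /* pack rows into lanes 0-3 */
    return out;
  }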
@@ -990,10 +1042,11 @@
Details : Even byte elements of 'in0' and 'in1' are interleaved
and written to 'out0'
*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
-}
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
@@ -1004,10 +1057,11 @@
Details : Even halfword elements of 'in0' and 'in1' are interleaved
and written to 'out0'
*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
- out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
-}
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+ }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
@@ -1019,10 +1073,11 @@
Details : Even word elements of 'in0' and 'in1' are interleaved
and written to 'out0'
*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
- out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
-}
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+ }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
/* Description : Interleave even double word elements from vectors
@@ -1032,10 +1087,11 @@
Details : Even double word elements of 'in0' and 'in1' are interleaved
and written to 'out0'
*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
- out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
-}
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+ }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
@@ -1045,20 +1101,22 @@
Details : Left half of byte elements of 'in0' and 'in1' are interleaved
and written to 'out0'.
*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
-}
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+ }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
@@ -1070,10 +1128,11 @@
Details : Left half of halfword elements of 'in0' and 'in1' are
interleaved and written to 'out0'.
*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
-}
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+ }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
@@ -1084,10 +1143,11 @@
Details : Left half of word elements of 'in0' and 'in1' are interleaved
and written to 'out0'.
*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
-}
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+ }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
@@ -1098,33 +1158,36 @@
Details : Right half of byte elements of 'in0' and 'in1' are interleaved
and written to out0.
*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
-}
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+ }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3); \
- ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
- out4, out5, out6, out7); \
-}
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \
+ out5, out6, out7) \
+ { \
+ ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3); \
+ ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \
+ out6, out7); \
+ }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
@@ -1134,32 +1197,36 @@
Details : Right half of halfword elements of 'in0' and 'in1' are
interleaved and written to 'out0'.
*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
-}
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+ }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
-}
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+ }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
@@ -1169,25 +1236,28 @@
Details : Right half of double word elements of 'in0' and 'in1' are
interleaved and written to 'out0'.
*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
- out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
-}
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+ }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
-}
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
+ }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
@@ -1198,26 +1268,29 @@
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'
*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
-}
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
-}
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
-}
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
@@ -1232,16 +1305,18 @@
value generated with (sat_val + 1) bit range.
The results are written in place
*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
- in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
-}
+#define SAT_UH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+ }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \
- SAT_UH2(RTYPE, in0, in1, sat_val); \
- SAT_UH2(RTYPE, in2, in3, sat_val) \
-}
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_UH2(RTYPE, in0, in1, sat_val); \
+ SAT_UH2(RTYPE, in2, in3, sat_val) \
+ }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
@@ -1254,16 +1329,18 @@
value generated with (sat_val + 1) bit range
The results are written in place
*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
- in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
-}
+#define SAT_SH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+ }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \
- SAT_SH2(RTYPE, in0, in1, sat_val); \
- SAT_SH2(RTYPE, in2, in3, sat_val); \
-}
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+ }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
@@ -1275,17 +1352,18 @@
elements in 'out0' vector
Valid index range for halfword operation is 0-7
*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \
- out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
- out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
-}
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+ }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
- out0, out1, out2, out3) { \
- SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
- SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
-}
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
+ { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
+ }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
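A sketch of the common coefficient-splat idiom; the packed-taps layout is an assumption borrowed from the convolve kernels, not something this patch changes:

  #include "vpx_dsp/mips/macros_msa.h"

  /* Broadcast four filter taps (packed in one halfword vector) into four
     per-tap vectors ready for dot products. */
  static void splat_taps(v8i16 filt, v8i16 *f0, v8i16 *f1, v8i16 *f2,
                         v8i16 *f3) {
    SPLATI_H4_SH(filt, 0, 1, 2, 3, *f0, *f1, *f2, *f3);
  }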
@@ -1297,19 +1375,21 @@
'out0' & even byte elements of 'in1' are copied to the right
half of 'out0'.
*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
-}
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+ }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
@@ -1322,18 +1402,20 @@
'out0' & even halfword elements of 'in1' are copied to the
right half of 'out0'.
*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
-}
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+ }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
/* Description : Pack even double word elements of vector pairs
@@ -1344,18 +1426,20 @@
'out0' & even double elements of 'in1' are copied to the right
half of 'out0'.
*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
- out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
-}
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
+ }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
@@ -1365,30 +1449,34 @@
Details : Each unsigned byte element from input vector 'in0' is
logically xor'ed with 128 and the result is stored in-place.
*/
-#define XORI_B2_128(RTYPE, in0, in1) { \
- in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
- in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
-}
+#define XORI_B2_128(RTYPE, in0, in1) \
+ { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+ }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-#define XORI_B3_128(RTYPE, in0, in1, in2) { \
- XORI_B2_128(RTYPE, in0, in1); \
- in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
-}
+#define XORI_B3_128(RTYPE, in0, in1, in2) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+ }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \
- XORI_B2_128(RTYPE, in0, in1); \
- XORI_B2_128(RTYPE, in2, in3); \
-}
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+ }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \
- XORI_B4_128(RTYPE, in0, in1, in2, in3); \
- XORI_B3_128(RTYPE, in4, in5, in6); \
-}
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
+ { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B3_128(RTYPE, in4, in5, in6); \
+ }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
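Illustrative sketch of why the 128-XOR exists: it re-biases unsigned pixels into the signed-byte range so signed dot-product instructions can be used, and the same XOR undoes it afterwards.

  #include "vpx_dsp/mips/macros_msa.h"

  /* Flip the sign bit of every byte in four pixel vectors, in place. */
  static void rebias_pixels(v16i8 *p0, v16i8 *p1, v16i8 *p2, v16i8 *p3) {
    XORI_B4_128_SB(*p0, *p1, *p2, *p3);
  }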
/* Description : Average of signed halfword elements -> (a + b) / 2
@@ -1400,13 +1488,14 @@
in one extra bit in the result. The result is then divided by
2 and written to 'out0'
*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
- out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
- out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
-}
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
+ out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
+ out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
+ }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
@@ -1417,17 +1506,19 @@
halfword elements of 'in1'. The result is then signed saturated
between halfword data type range
*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
-}
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
+ }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
@@ -1437,12 +1528,13 @@
Details : Each element of vector 'in0' is left shifted by 'shift' and
the result is written in-place.
*/
-#define SLLI_4V(in0, in1, in2, in3, shift) { \
- in0 = in0 << shift; \
- in1 = in1 << shift; \
- in2 = in2 << shift; \
- in3 = in3 << shift; \
-}
+#define SLLI_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+ }
/* Description : Arithmetic shift right all elements of vector
(generic for all data types)
@@ -1452,12 +1544,13 @@
Details : Each element of vector 'in0' is right shifted by 'shift' and
the result is written in-place. 'shift' is a GP variable.
*/
-#define SRA_4V(in0, in1, in2, in3, shift) { \
- in0 = in0 >> shift; \
- in1 = in1 >> shift; \
- in2 = in2 >> shift; \
- in3 = in3 >> shift; \
-}
+#define SRA_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ in2 = in2 >> shift; \
+ in3 = in3 >> shift; \
+ }
/* Description : Shift right arithmetic rounded words
Arguments : Inputs - in0, in1, shift
@@ -1469,15 +1562,17 @@
rounding and the result is written in-place.
'shift' is a vector.
*/
-#define SRAR_W2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
- in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
-}
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRAR_W2(RTYPE, in0, in1, shift) \
- SRAR_W2(RTYPE, in2, in3, shift) \
-}
+#define SRAR_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+ }
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRAR_W2(RTYPE, in0, in1, shift) \
+ SRAR_W2(RTYPE, in2, in3, shift) \
+ }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
@@ -1489,30 +1584,34 @@
shifted value for rounding and the result is written in-place.
'shift' is an immediate value.
*/
-#define SRARI_H2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
- in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
-}
+#define SRARI_H2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+ }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_H2(RTYPE, in0, in1, shift); \
- SRARI_H2(RTYPE, in2, in3, shift); \
-}
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+ }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-#define SRARI_W2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
- in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
-}
+#define SRARI_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+ }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_W2(RTYPE, in0, in1, shift); \
- SRARI_W2(RTYPE, in2, in3, shift); \
-}
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+ }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
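A sketch of the rounding-shift step between transform stages; the shift of 14 is assumed to match libvpx's DCT_CONST_BITS and must be a compile-time immediate:

  #include "vpx_dsp/mips/macros_msa.h"

  /* Shift four word vectors right arithmetically with rounding, in place. */
  static void round_shift_14(v4i32 *w0, v4i32 *w1, v4i32 *w2, v4i32 *w3) {
    SRARI_W4_SW(*w0, *w1, *w2, *w3, 14);
  }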
/* Description : Logical shift right all elements of vector (immediate)
@@ -1522,12 +1621,13 @@
Details : Each element of vector 'in0' is right shifted by 'shift' and
the result is written in-place. 'shift' is an immediate value.
*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \
- out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
- out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
- out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
- out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
-}
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
+ { \
+ out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
+ out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
+ out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
+ out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
+ }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
@@ -1536,15 +1636,16 @@
Details : Each element from 'in0' is multiplied with elements from 'in1'
and the result is written to 'out0'
*/
-#define MUL2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
-}
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- MUL2(in0, in1, in2, in3, out0, out1); \
- MUL2(in4, in5, in6, in7, out2, out3); \
-}
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+ }
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+ }
/* Description : Addition of 2 pairs of vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1552,15 +1653,16 @@
Details : Each element in 'in0' is added to 'in1' and result is written
to 'out0'.
*/
-#define ADD2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
-}
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ADD2(in0, in1, in2, in3, out0, out1); \
- ADD2(in4, in5, in6, in7, out2, out3); \
-}
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+ }
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+ }
/* Description : Subtraction of 2 pairs of vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1568,17 +1670,18 @@
Details : Each element in 'in1' is subtracted from 'in0' and result is
written to 'out0'.
*/
-#define SUB2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
-}
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- out2 = in4 - in5; \
- out3 = in6 - in7; \
-}
+#define SUB2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ }
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+ }
/* Description : Sign extend halfword elements from right half of the vector
Arguments : Input - in (halfword vector)
@@ -1588,12 +1691,13 @@
extracted and interleaved with same vector 'in0' to generate
4 word elements keeping sign intact
*/
-#define UNPCK_R_SH_SW(in, out) { \
- v8i16 sign_m; \
- \
- sign_m = __msa_clti_s_h((v8i16)in, 0); \
- out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
-}
+#define UNPCK_R_SH_SW(in, out) \
+ { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+ }
/* Description : Zero extend unsigned byte elements to halfword elements
Arguments : Input - in (unsigned byte vector)
@@ -1602,11 +1706,12 @@
Details : Zero extended right half of vector is returned in 'out0'
Zero extended left half of vector is returned in 'out1'
*/
-#define UNPCK_UB_SH(in, out0, out1) { \
- v16i8 zero_m = { 0 }; \
- \
- ILVRL_B2_SH(zero_m, in, out0, out1); \
-}
+#define UNPCK_UB_SH(in, out0, out1) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+ }
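Sketch of the widening step this macro provides (illustrative only, same include assumption as the earlier sketches):

  #include "vpx_dsp/mips/macros_msa.h"

  /* Zero-extend 16 unsigned bytes to two vectors of 8 halfwords each, so
     16-bit arithmetic can be applied to pixel data. */
  static void widen_pixels(v16u8 pixels, v8i16 *lo, v8i16 *hi) {
    UNPCK_UB_SH(pixels, *lo, *hi);  /* right half -> *lo, left half -> *hi */
  }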
/* Description : Sign extend halfword elements from input vector and return
the result in pair of vectors
@@ -1619,91 +1724,96 @@
Then interleaved left with same vector 'in0' to
generate 4 signed word elements in 'out1'
*/
-#define UNPCK_SH_SW(in, out0, out1) { \
- v8i16 tmp_m; \
- \
- tmp_m = __msa_clti_s_h((v8i16)in, 0); \
- ILVRL_H2_SW(tmp_m, in, out0, out1); \
-}
+#define UNPCK_SH_SW(in, out0, out1) \
+ { \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h((v8i16)in, 0); \
+ ILVRL_H2_SW(tmp_m, in, out0, out1); \
+ }
/* Description : Butterfly of 4 input vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3
Details : Butterfly operation
*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- out0 = in0 + in3; \
- out1 = in1 + in2; \
- \
- out2 = in1 - in2; \
- out3 = in0 - in3; \
-}
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+ }
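A worked sketch of the 4-point butterfly (helper name illustrative):

  #include "vpx_dsp/mips/macros_msa.h"

  /* Pairwise sums and differences of mirrored inputs:
       out[0] = in0 + in3   out[1] = in1 + in2
       out[2] = in1 - in2   out[3] = in0 - in3 */
  static void butterfly4(v8i16 in0, v8i16 in1, v8i16 in2, v8i16 in3,
                         v8i16 out[4]) {
    BUTTERFLY_4(in0, in1, in2, in3, out[0], out[1], out[2], out[3]);
  }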
/* Description : Butterfly of 8 input vectors
Arguments : Inputs - in0 ... in7
Outputs - out0 .. out7
Details : Butterfly operation
*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- out0 = in0 + in7; \
- out1 = in1 + in6; \
- out2 = in2 + in5; \
- out3 = in3 + in4; \
- \
- out4 = in3 - in4; \
- out5 = in2 - in5; \
- out6 = in1 - in6; \
- out7 = in0 - in7; \
-}
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ out0 = in0 + in7; \
+ out1 = in1 + in6; \
+ out2 = in2 + in5; \
+ out3 = in3 + in4; \
+ \
+ out4 = in3 - in4; \
+ out5 = in2 - in5; \
+ out6 = in1 - in6; \
+ out7 = in0 - in7; \
+ }
/* Description : Butterfly of 16 input vectors
Arguments : Inputs - in0 ... in15
Outputs - out0 .. out15
Details : Butterfly operation
*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- out8, out9, out10, out11, out12, out13, out14, out15) { \
- out0 = in0 + in15; \
- out1 = in1 + in14; \
- out2 = in2 + in13; \
- out3 = in3 + in12; \
- out4 = in4 + in11; \
- out5 = in5 + in10; \
- out6 = in6 + in9; \
- out7 = in7 + in8; \
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, \
+ out4, out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
\
- out8 = in7 - in8; \
- out9 = in6 - in9; \
- out10 = in5 - in10; \
- out11 = in4 - in11; \
- out12 = in3 - in12; \
- out13 = in2 - in13; \
- out14 = in1 - in14; \
- out15 = in0 - in15; \
-}
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+ }
/* Description : Transpose input 8x8 byte block
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - as per RTYPE
*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
- tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
- ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
- ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
- ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
- SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
- SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
-}
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+ tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+ }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
@@ -1712,128 +1822,133 @@
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - unsigned byte
*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
- ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
- ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
- ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
- \
- tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
- tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
- tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
- tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
- out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
- tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
- out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
- tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
- \
- ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
- out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
- out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
- out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
- out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
-}
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ }
/* Description : Transpose 4x4 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3
Return Type - signed halfword
*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 s0_m, s1_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
- ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
-}
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+ }
/* Description : Transpose 4x8 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - signed halfword
*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
- v8i16 zero_m = { 0 }; \
- \
- ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
- ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
- ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
- \
- out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- \
- out4 = zero_m; \
- out5 = zero_m; \
- out6 = zero_m; \
- out7 = zero_m; \
-}
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
+ tmp3_n); \
+ ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
+ ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
+ \
+ out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+ }
/* Description : Transpose 8x4 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - signed halfword
*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
- ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
- ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
-}
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+ }
/* Description : Transpose 8x8 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - as per RTYPE
*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 s0_m, s1_m; \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
- ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
- ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
- PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
- tmp3_m, tmp7_m, out0, out2, out4, out6); \
- out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
- out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
- out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
- out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
-}
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
+ PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
+ tmp7_m, out0, out2, out4, out6); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
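Sketch of the usual call site for this transpose, the step between the row and column passes of an 8x8 transform (illustrative, not from this patch):

  #include "vpx_dsp/mips/macros_msa.h"

  /* Transpose an 8x8 block of halfwords held in eight row vectors into
     eight column vectors. */
  static void transpose_8x8(v8i16 r[8], v8i16 c[8]) {
    TRANSPOSE8x8_SH_SH(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7],
                       c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]);
  }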
/* Description : Transpose 4x4 block with word elements in vectors
@@ -1841,40 +1956,42 @@
Outputs - out0, out1, out2, out3
Return Type - signed word
*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- \
- out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
- out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
- out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
- out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
-}
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+ }
/* Description : Add block 4x4
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
Details : Least significant 4 bytes from each input vector are added to
the destination bytes, clipped between 0-255 and stored.
*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \
- uint32_t src0_m, src1_m, src2_m, src3_m; \
- v8i16 inp0_m, inp1_m, res0_m, res1_m; \
- v16i8 dst0_m = { 0 }; \
- v16i8 dst1_m = { 0 }; \
- v16i8 zero_m = { 0 }; \
- \
- ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
- LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
- INSERT_W2_SB(src0_m, src1_m, dst0_m); \
- INSERT_W2_SB(src2_m, src3_m, dst1_m); \
- ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
- ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
- CLIP_SH2_0_255(res0_m, res1_m); \
- PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
- ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
-}
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
+ { \
+ uint32_t src0_m, src1_m, src2_m, src3_m; \
+ v8i16 inp0_m, inp1_m, res0_m, res1_m; \
+ v16i8 dst0_m = { 0 }; \
+ v16i8 dst1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
+ LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); \
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); \
+ ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
+ CLIP_SH2_0_255(res0_m, res1_m); \
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
+ ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
+ }
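Illustrative call, mirroring how the 4x4 inverse-transform code finishes a block:

  #include <stdint.h>
  #include "vpx_dsp/mips/macros_msa.h"

  /* Add four rows of halfword residual to the destination pixels, clip to
     0..255 and store the reconstructed 4x4 block. */
  static void add_block_4x4(v8i16 res0, v8i16 res1, v8i16 res2, v8i16 res3,
                            uint8_t *dst, int32_t dst_stride) {
    ADDBLK_ST4x4_UB(res0, res1, res2, res3, dst, dst_stride);
  }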
/* Description : Pack even elements of input vectors & xor with 128
Arguments : Inputs - in0, in1
@@ -1884,53 +2001,57 @@
together in one vector and the resulting vector is xor'ed with
128 to shift the range from signed to unsigned byte
*/
-#define PCKEV_XORI128_UB(in0, in1) ({ \
- v16u8 out_m; \
- \
- out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
- out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
- out_m; \
-})
+#define PCKEV_XORI128_UB(in0, in1) \
+ ({ \
+ v16u8 out_m; \
+ \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+ })
/* Description : Converts inputs to unsigned bytes, interleave, average & store
as 8x4 unsigned byte block
Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
pdst, stride
*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
- dst0, dst1, dst2, dst3, pdst, stride) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- \
- tmp0_m = PCKEV_XORI128_UB(in0, in1); \
- tmp1_m = PCKEV_XORI128_UB(in2, in3); \
- ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
-}
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
+ pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ }
/* Description : Pack even byte elements and store byte vector in destination
memory
Arguments : Inputs - in0, in1, pdst
*/
-#define PCKEV_ST_SB(in0, in1, pdst) { \
- v16i8 tmp_m; \
- \
- tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
- ST_SB(tmp_m, (pdst)); \
-}
+#define PCKEV_ST_SB(in0, in1, pdst) \
+ { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
+ }
/* Description : Horizontal 2 tap filter kernel code
Arguments : Inputs - in0, in1, mask, coeff, shift
*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \
- v16i8 tmp0_m; \
- v8u16 tmp1_m; \
- \
- tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
- tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
- tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
- \
- tmp1_m; \
-})
-#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
+ ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ \
+ tmp1_m; \
+ })
+#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
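A sketch of the 2-tap kernel above in use; the shift value of 7 is assumed to match libvpx's FILTER_BITS and the mask layout comes from the convolve code, neither of which this patch touches:

  #include "vpx_dsp/mips/macros_msa.h"

  /* Filter one row pair with a 2-tap kernel: shuffle adjacent pixels with
     'mask', dot them with the coefficient pair, then round-shift. */
  static v8u16 filt_2tap(v16u8 row0, v16u8 row1, v16i8 mask, v16u8 coeff) {
    return HORIZ_2TAP_FILT_UH(row0, row1, mask, coeff, 7);
  }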
diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c
index 3bdec28e6..6455814e1 100644
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -11,12 +11,13 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
-#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) { \
- out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
-}
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
+ }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
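/* Illustrative expansion (not part of this patch): the variadic wrapper only
   fixes RTYPE, so

     SAD_INSVE_W4_UB(a, b, c, d, out);

   becomes SAD_INSVE_W4(v16u8, a, b, c, d, out), copying the first 32-bit word
   of each of a..d into lanes 0..3 of out. */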
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
@@ -58,8 +59,8 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
sad += SAD_UB2_UH(src0, src1, ref0, ref1);
}
@@ -214,8 +215,8 @@ static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
+ ref0, ref1);
sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
@@ -473,8 +474,8 @@ static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
+ ref0, ref1);
sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
@@ -793,9 +794,9 @@ static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
}
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t * const aref_ptr[],
- int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
int32_t ht_cnt;
uint32_t src0, src1, src2, src3;
@@ -854,9 +855,9 @@ static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
}
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t * const aref_ptr[],
- int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
int32_t ht_cnt;
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
v16u8 src0, src1, src2, src3;
@@ -905,9 +906,9 @@ static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
}
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t * const aref_ptr[],
- int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
int32_t ht_cnt;
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
v16u8 src, ref0, ref1, ref2, ref3, diff;
@@ -970,9 +971,9 @@ static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
}
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t * const aref_ptr[],
- int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
int32_t ht_cnt;
v16u8 src0, src1, ref0, ref1;
@@ -1014,9 +1015,9 @@ static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
}
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t * const aref_ptr[],
- int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
int32_t ht_cnt;
v16u8 src0, src1, src2, src3;
@@ -1114,8 +1115,8 @@ static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
ref += (4 * ref_stride);
LD_UB2(sec_pred, 16, pred0, pred1);
sec_pred += 32;
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
sad += SAD_UB2_UH(src0, src1, diff0, diff1);
}
@@ -1213,8 +1214,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
- comp0, comp1, comp2, comp3);
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
@@ -1224,8 +1225,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
- comp0, comp1, comp2, comp3);
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
@@ -1235,8 +1236,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
- comp0, comp1, comp2, comp3);
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
@@ -1246,8 +1247,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
- comp0, comp1, comp2, comp3);
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
}
@@ -1258,180 +1259,180 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
return HADD_SW_S32(sad);
}
-#define VPX_SAD_4xHEIGHT_MSA(height) \
-uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
-}
+#define VPX_SAD_4xHEIGHT_MSA(height) \
+ uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
+ }
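/* Illustrative expansion (not part of this patch): the ## token pasting in
   these wrappers builds the exported symbol name from the height argument,
   e.g. VPX_SAD_4xHEIGHT_MSA(8) defines

     uint32_t vpx_sad4x8_msa(const uint8_t *src, int32_t src_stride,
                             const uint8_t *ref, int32_t ref_stride) {
       return sad_4width_msa(src, src_stride, ref, ref_stride, 8);
     }
*/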
-#define VPX_SAD_8xHEIGHT_MSA(height) \
-uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
-}
+#define VPX_SAD_8xHEIGHT_MSA(height) \
+ uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
+ }
-#define VPX_SAD_16xHEIGHT_MSA(height) \
-uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
-}
+#define VPX_SAD_16xHEIGHT_MSA(height) \
+ uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
+ }
-#define VPX_SAD_32xHEIGHT_MSA(height) \
-uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
-}
+#define VPX_SAD_32xHEIGHT_MSA(height) \
+ uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
+ }
-#define VPX_SAD_64xHEIGHT_MSA(height) \
-uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
-}
+#define VPX_SAD_64xHEIGHT_MSA(height) \
+ uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
+ }
-#define VPX_SAD_4xHEIGHTx3_MSA(height) \
-void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_4xHEIGHTx3_MSA(height) \
+ void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_8xHEIGHTx3_MSA(height) \
-void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_8xHEIGHTx3_MSA(height) \
+ void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_16xHEIGHTx3_MSA(height) \
-void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_16xHEIGHTx3_MSA(height) \
+ void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_32xHEIGHTx3_MSA(height) \
-void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_32xHEIGHTx3_MSA(height) \
+ void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_64xHEIGHTx3_MSA(height) \
-void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_64xHEIGHTx3_MSA(height) \
+ void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_4xHEIGHTx8_MSA(height) \
-void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_4xHEIGHTx8_MSA(height) \
+ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_8xHEIGHTx8_MSA(height) \
-void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_8xHEIGHTx8_MSA(height) \
+ void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_16xHEIGHTx8_MSA(height) \
-void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_16xHEIGHTx8_MSA(height) \
+ void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_32xHEIGHTx8_MSA(height) \
-void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_32xHEIGHTx8_MSA(height) \
+ void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_64xHEIGHTx8_MSA(height) \
-void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
-}
+#define VPX_SAD_64xHEIGHTx8_MSA(height) \
+ void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
-#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
-void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
-}
+#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
+ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
-#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
-void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
-}
+#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
+ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
-#define VPX_SAD_16xHEIGHTx4D_MSA(height) \
-void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
-}
+#define VPX_SAD_16xHEIGHTx4D_MSA(height) \
+ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
-#define VPX_SAD_32xHEIGHTx4D_MSA(height) \
-void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
-}
+#define VPX_SAD_32xHEIGHTx4D_MSA(height) \
+ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
-#define VPX_SAD_64xHEIGHTx4D_MSA(height) \
-void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
-}
+#define VPX_SAD_64xHEIGHTx4D_MSA(height) \
+ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
-#define VPX_AVGSAD_4xHEIGHT_MSA(height) \
-uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_4width_msa(src, src_stride, ref, ref_stride, \
- height, second_pred); \
-}
+#define VPX_AVGSAD_4xHEIGHT_MSA(height) \
+ uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
-#define VPX_AVGSAD_8xHEIGHT_MSA(height) \
-uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_8width_msa(src, src_stride, ref, ref_stride, \
- height, second_pred); \
-}
+#define VPX_AVGSAD_8xHEIGHT_MSA(height) \
+ uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
-#define VPX_AVGSAD_16xHEIGHT_MSA(height) \
-uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_16width_msa(src, src_stride, ref, ref_stride, \
- height, second_pred); \
-}
+#define VPX_AVGSAD_16xHEIGHT_MSA(height) \
+ uint32_t vpx_sad16x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
-#define VPX_AVGSAD_32xHEIGHT_MSA(height) \
-uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_32width_msa(src, src_stride, ref, ref_stride, \
- height, second_pred); \
-}
+#define VPX_AVGSAD_32xHEIGHT_MSA(height) \
+ uint32_t vpx_sad32x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
-#define VPX_AVGSAD_64xHEIGHT_MSA(height) \
-uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_64width_msa(src, src_stride, ref, ref_stride, \
- height, second_pred); \
-}
+#define VPX_AVGSAD_64xHEIGHT_MSA(height) \
+ uint32_t vpx_sad64x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
diff --git a/vpx_dsp/mips/sub_pixel_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c
index a592a2d07..313e06f92 100644
--- a/vpx_dsp/mips/sub_pixel_variance_msa.c
+++ b/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -14,29 +14,23 @@
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters_msa[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
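/* Illustrative note (not part of this patch): each row of the table sums to
   128, i.e. { 128 - 16 * i, 16 * i } for sub-pixel offset i = 0..7, so the
   SRARI_*(..., FILTER_BITS) rounding shifts in the kernels below scale the
   bilinear dot product back down to pixel range. */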
-#define CALC_MSE_AVG_B(src, ref, var, sub) { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
-}
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
-#define VARIANCE_WxH(sse, diff, shift) \
- sse - (((uint32_t)diff * diff) >> shift)
+#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
#define VARIANCE_LARGE_WxH(sse, diff, shift) \
sse - (((int64_t)diff * diff) >> shift)
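/* Worked example (not part of this patch): both macros compute
   var = sse - sum * sum / N with the division done as a right shift, so the
   shift argument is log2(width * height); the 64x32 and 64x64 wrappers near
   the end of this file pass 11 (2048 pixels) and 12 (4096 pixels). */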
@@ -45,8 +39,7 @@ static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
int32_t src_stride,
const uint8_t *ref_ptr,
int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height,
+ const uint8_t *sec_pred, int32_t height,
int32_t *diff) {
int32_t ht_cnt;
uint32_t src0, src1, src2, src3;
@@ -81,8 +74,7 @@ static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
int32_t src_stride,
const uint8_t *ref_ptr,
int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height,
+ const uint8_t *sec_pred, int32_t height,
int32_t *diff) {
int32_t ht_cnt;
v16u8 src0, src1, src2, src3;
@@ -99,8 +91,8 @@ static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
CALC_MSE_AVG_B(src0, ref0, var, avg);
CALC_MSE_AVG_B(src1, ref1, var, avg);
@@ -117,8 +109,7 @@ static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
const uint8_t *ref_ptr,
int32_t ref_stride,
const uint8_t *sec_pred,
- int32_t height,
- int32_t *diff) {
+ int32_t height, int32_t *diff) {
int32_t ht_cnt;
v16u8 src, ref, pred;
v8i16 avg = { 0 };
@@ -173,8 +164,7 @@ static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
const uint8_t *ref_ptr,
int32_t ref_stride,
const uint8_t *sec_pred,
- int32_t height,
- int32_t *diff) {
+ int32_t height, int32_t *diff) {
int32_t ht_cnt;
v16u8 src0, src1, ref0, ref1, pred0, pred1;
v8i16 avg = { 0 };
@@ -232,8 +222,7 @@ static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
int32_t src_stride,
const uint8_t *ref_ptr,
int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t *diff) {
+ const uint8_t *sec_pred, int32_t *diff) {
int32_t ht_cnt;
v16u8 src0, src1, ref0, ref1, pred0, pred1;
v8i16 avg0 = { 0 };
@@ -293,8 +282,7 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
int32_t src_stride,
const uint8_t *ref_ptr,
int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t *diff) {
+ const uint8_t *sec_pred, int32_t *diff) {
int32_t ht_cnt;
v16u8 src0, src1, src2, src3;
v16u8 ref0, ref1, ref2, ref3;
@@ -310,8 +298,8 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
src_ptr += src_stride;
LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
CALC_MSE_AVG_B(src0, ref0, var, avg0);
CALC_MSE_AVG_B(src2, ref2, var, avg0);
CALC_MSE_AVG_B(src1, ref1, var, avg1);
@@ -323,8 +311,8 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
src_ptr += src_stride;
LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
CALC_MSE_AVG_B(src0, ref0, var, avg0);
CALC_MSE_AVG_B(src2, ref2, var, avg0);
CALC_MSE_AVG_B(src1, ref1, var, avg1);
@@ -343,8 +331,7 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
int32_t src_stride,
const uint8_t *ref_ptr,
int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t *diff) {
+ const uint8_t *sec_pred, int32_t *diff) {
int32_t ht_cnt;
v16u8 src0, src1, src2, src3;
v16u8 ref0, ref1, ref2, ref3;
@@ -362,8 +349,8 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
src_ptr += src_stride;
LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
CALC_MSE_AVG_B(src0, ref0, var, avg0);
CALC_MSE_AVG_B(src1, ref1, var, avg1);
CALC_MSE_AVG_B(src2, ref2, var, avg2);
@@ -375,8 +362,8 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
src_ptr += src_stride;
LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
CALC_MSE_AVG_B(src0, ref0, var, avg0);
CALC_MSE_AVG_B(src1, ref1, var, avg1);
CALC_MSE_AVG_B(src2, ref2, var, avg2);
@@ -392,13 +379,9 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
uint32_t ref0, ref1, ref2, ref3;
@@ -420,11 +403,11 @@ static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
CALC_MSE_AVG_B(src0, ref, var, avg);
@@ -436,13 +419,9 @@ static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 filt0, out, ref0, ref1, ref2, ref3;
@@ -464,11 +443,11 @@ static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
CALC_MSE_AVG_B(out, ref0, var, avg);
out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
@@ -481,13 +460,9 @@ static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -512,14 +487,14 @@ static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- out0, out1, out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
- out4, out5, out6, out7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
- src0, src1, src2, src3);
+ PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
+ src2, src3);
CALC_MSE_AVG_B(src0, dst0, var, avg);
CALC_MSE_AVG_B(src1, dst1, var, avg);
CALC_MSE_AVG_B(src2, dst2, var, avg);
@@ -532,13 +507,9 @@ static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[2];
@@ -554,13 +525,9 @@ static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[4];
@@ -576,13 +543,9 @@ static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
uint32_t ref0, ref1, ref2, ref3;
@@ -608,8 +571,8 @@ static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
dst += (4 * dst_stride);
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
- src10_r, src21_r, src32_r, src43_r);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
@@ -624,13 +587,9 @@ static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4;
@@ -654,10 +613,10 @@ static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
dst += (4 * dst_stride);
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
- vec0, vec1, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
CALC_MSE_AVG_B(src0, ref0, var, avg);
@@ -671,13 +630,9 @@ static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 ref0, ref1, ref2, ref3;
@@ -734,13 +689,9 @@ static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[2];
@@ -756,13 +707,9 @@ static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[4];
@@ -778,14 +725,10 @@ static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
uint32_t ref0, ref1, ref2, ref3;
@@ -831,14 +774,10 @@ static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 ref0, ref1, ref2, ref3;
@@ -892,14 +831,10 @@ static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -969,14 +904,10 @@ static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[2];
@@ -993,14 +924,10 @@ static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[4];
@@ -1017,14 +944,10 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
uint32_t ref0, ref1, ref2, ref3;
@@ -1049,11 +972,11 @@ static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
out = __msa_aver_u_b(out, pred);
@@ -1066,14 +989,10 @@ static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 out, pred, filt0;
@@ -1096,11 +1015,11 @@ static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
pred = LD_UB(sec_pred);
@@ -1120,15 +1039,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff,
- int32_t width) {
+static uint32_t subpel_avg_ssediff_16w_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
int16_t filtval;
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -1157,16 +1071,16 @@ static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- out0, out1, out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
- out4, out5, out6, out7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
- tmp0, tmp1, tmp2, tmp3);
- AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
- tmp0, tmp1, tmp2, tmp3);
+ PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
+ tmp2, tmp3);
+ AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
+ tmp2, tmp3);
CALC_MSE_AVG_B(tmp0, dst0, var, avg);
CALC_MSE_AVG_B(tmp1, dst1, var, avg);
@@ -1180,33 +1094,25 @@ static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
sec_pred, filter, height, diff, 16);
}
-static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[2];
for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 32);
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
src += 16;
dst += 16;
sec_pred += 16;
@@ -1217,21 +1123,17 @@ static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[4];
for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 64);
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
src += 16;
dst += 16;
sec_pred += 16;
@@ -1242,14 +1144,10 @@ static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
uint32_t ref0, ref1, ref2, ref3;
@@ -1276,8 +1174,8 @@ static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
dst += (4 * dst_stride);
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
- src10_r, src21_r, src32_r, src43_r);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
@@ -1294,14 +1192,10 @@ static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4;
@@ -1326,10 +1220,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
dst += (4 * dst_stride);
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
- vec0, vec1, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
@@ -1345,15 +1239,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff,
- int32_t width) {
+static uint32_t subpel_avg_ssediff_16w_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 ref0, ref1, ref2, ref3;
@@ -1401,8 +1290,8 @@ static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
dst += (4 * dst_stride);
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
- out0, out1, out2, out3);
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
CALC_MSE_AVG_B(out0, ref0, var, avg);
CALC_MSE_AVG_B(out1, ref1, var, avg);
@@ -1416,33 +1305,25 @@ static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
return HADD_SW_S32(var);
}
-static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
sec_pred, filter, height, diff, 16);
}
-static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[2];
for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 32);
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
src += 16;
dst += 16;
sec_pred += 16;
@@ -1453,21 +1334,17 @@ static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
return sse;
}
-static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[4];
for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 64);
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
src += 16;
dst += 16;
sec_pred += 16;
@@ -1479,11 +1356,9 @@ static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
}
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
uint32_t ref0, ref1, ref2, ref3;
@@ -1532,11 +1407,9 @@ static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
}
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 ref0, ref1, ref2, ref3;
@@ -1598,16 +1471,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
return HADD_SW_S32(var);
}
-static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff,
- int32_t width) {
+static uint32_t subpel_avg_ssediff_16w_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
int16_t filtval;
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -1669,8 +1536,8 @@ static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
dst += (4 * dst_stride);
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
- out0, out1, out2, out3);
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
CALC_MSE_AVG_B(out0, ref0, var, avg);
CALC_MSE_AVG_B(out1, ref1, var, avg);
@@ -1685,22 +1552,18 @@ static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
}
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
sec_pred, filter_horiz, filter_vert,
height, diff, 16);
}
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[2];
@@ -1719,11 +1582,9 @@ static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
}
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
uint32_t loop_cnt, sse = 0;
int32_t diff0[4];
@@ -1756,47 +1617,40 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
- int32_t src_stride, \
- int32_t xoffset, \
- int32_t yoffset, \
- const uint8_t *ref, \
- int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- uint32_t var; \
- const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, v_filter, \
- ht, &diff); \
- } else { \
- *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
- ref, ref_stride, \
- v_filter, ht, &diff); \
- } \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, ht, &diff); \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
- ref, ref_stride, sse); \
- } \
- } \
- \
- return var; \
-}
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
+ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
+ src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
+ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
+ sse); \
+ } \
+ } \
+ \
+ return var; \
+ }
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
@@ -1817,42 +1671,37 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
- const uint8_t *src_ptr, int32_t src_stride, \
- int32_t xoffset, int32_t yoffset, \
- const uint8_t *ref_ptr, int32_t ref_stride, \
- uint32_t *sse, const uint8_t *sec_pred) { \
- int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
\
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, v_filter, \
- ht, &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- ht, &diff); \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
} else { \
- *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, ht, &diff); \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, ht, &diff); \
+ } \
} \
- } \
\
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
-}
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
@@ -1870,11 +1719,9 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
int32_t src_stride,
- int32_t xoffset,
- int32_t yoffset,
+ int32_t xoffset, int32_t yoffset,
const uint8_t *ref_ptr,
- int32_t ref_stride,
- uint32_t *sse,
+ int32_t ref_stride, uint32_t *sse,
const uint8_t *sec_pred) {
int32_t diff;
const uint8_t *h_filter = bilinear_filters_msa[xoffset];
@@ -1882,22 +1729,19 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
if (yoffset) {
if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
- ref_ptr, ref_stride,
- sec_pred, h_filter,
- v_filter, 64, &diff);
+ *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
+ v_filter, 64, &diff);
} else {
- *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
- ref_ptr, ref_stride,
- sec_pred, v_filter,
- 64, &diff);
+ *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ v_filter, 64, &diff);
}
} else {
if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
- ref_ptr, ref_stride,
- sec_pred, h_filter,
- 64, &diff);
+ *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ h_filter, 64, &diff);
} else {
*sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
sec_pred, &diff);
@@ -1907,46 +1751,38 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
return VARIANCE_32Wx64H(*sse, diff);
}
-#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
-uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr, \
- int32_t src_stride, \
- int32_t xoffset, \
- int32_t yoffset, \
- const uint8_t *ref_ptr, \
- int32_t ref_stride, \
- uint32_t *sse, \
- const uint8_t *sec_pred) { \
- int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, v_filter, \
- ht, &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- ht, &diff); \
- } else { \
- *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, &diff); \
- } \
- } \
- \
- return VARIANCE_64Wx##ht##H(*sse, diff); \
-}
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
+ uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_64Wx##ht##H(*sse, diff); \
+ }
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
diff --git a/vpx_dsp/mips/subtract_msa.c b/vpx_dsp/mips/subtract_msa.c
index 9ac43c5cd..391a7ebf6 100644
--- a/vpx_dsp/mips/subtract_msa.c
+++ b/vpx_dsp/mips/subtract_msa.c
@@ -68,8 +68,8 @@ static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
- LD_SB8(pred, pred_stride,
- pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
+ LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
+ pred7);
pred += (8 * pred_stride);
ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
@@ -226,31 +226,31 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
}
}
-void vpx_subtract_block_msa(int32_t rows, int32_t cols,
- int16_t *diff_ptr, ptrdiff_t diff_stride,
- const uint8_t *src_ptr, ptrdiff_t src_stride,
- const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
if (rows == cols) {
switch (rows) {
case 4:
- sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
+ sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
break;
case 8:
- sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
+ sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
break;
case 16:
- sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
+ sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
break;
case 32:
- sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
+ sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
break;
case 64:
- sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
+ sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
break;
default:
vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
diff --git a/vpx_dsp/mips/txfm_macros_msa.h b/vpx_dsp/mips/txfm_macros_msa.h
index 68c63d56f..da100f6a9 100644
--- a/vpx_dsp/mips/txfm_macros_msa.h
+++ b/vpx_dsp/mips/txfm_macros_msa.h
@@ -13,81 +13,84 @@
#include "vpx_dsp/mips/macros_msa.h"
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
- v8i16 k0_m = __msa_fill_h(cnst0); \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- s0_m = (v4i32)__msa_fill_h(cnst1); \
- k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
- \
- ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
- ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
- DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
- \
- DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
-}
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
+ { \
+ v8i16 k0_m = __msa_fill_h(cnst0); \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = (v4i32)__msa_fill_h(cnst1); \
+ k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
+ \
+ ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
+ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
+ DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ \
+ DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ }
-#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
- dst0, dst1, dst2, dst3) { \
- v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
- v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
- \
- DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
- tp0_m, tp2_m, tp3_m, tp4_m); \
- DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
- tp5_m, tp6_m, tp7_m, tp8_m); \
- BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
- BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
- SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
- SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
- PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
- dst0, dst1, dst2, dst3); \
-}
+#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \
+ dst1, dst2, dst3) \
+ { \
+ v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
+ v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
+ \
+ DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \
+ tp4_m); \
+ DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \
+ tp8_m); \
+ BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
+ BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
+ SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
+ SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
+ dst1, dst2, dst3); \
+ }
-#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
- v8i16 dst_m; \
- v4i32 tp0_m, tp1_m; \
- \
- DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
- SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
- dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
- \
- dst_m; \
-})
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \
+ ({ \
+ v8i16 dst_m; \
+ v4i32 tp0_m, tp1_m; \
+ \
+ DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
+ SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
+ dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
+ \
+ dst_m; \
+ })
-#define MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
- v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
- v8i16 madd_s0_m, madd_s1_m; \
- \
- ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
- c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
- SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
-}
+#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \
+ { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
+ madd0_m, madd1_m, madd2_m, madd3_m); \
+ SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
+ }
-#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
- out0, out1, out2, out3) { \
- v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
- \
- ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
- ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
- cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
- m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
- cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
- m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
-}
+#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
+ out2, out3) \
+ { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
+ cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
+ cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+ }
#endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/vpx_dsp/mips/variance_msa.c b/vpx_dsp/mips/variance_msa.c
index 33e175560..085990e48 100644
--- a/vpx_dsp/mips/variance_msa.c
+++ b/vpx_dsp/mips/variance_msa.c
@@ -11,28 +11,29 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
-#define CALC_MSE_B(src, ref, var) { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
-}
+#define CALC_MSE_B(src, ref, var) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ }
-#define CALC_MSE_AVG_B(src, ref, var, sub) { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
-}
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
-#define VARIANCE_WxH(sse, diff, shift) \
- sse - (((uint32_t)diff * diff) >> shift)
+#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
#define VARIANCE_LARGE_WxH(sse, diff, shift) \
sse - (((int64_t)diff * diff) >> shift)
@@ -80,8 +81,8 @@ static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
CALC_MSE_AVG_B(src0, ref0, var, avg);
CALC_MSE_AVG_B(src1, ref1, var, avg);
}
@@ -370,8 +371,8 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
- src0, src1, ref0, ref1);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
CALC_MSE_B(src0, ref0, var);
CALC_MSE_B(src1, ref1, var);
}
@@ -526,19 +527,17 @@ uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src, \
- int32_t src_stride, \
- const uint8_t *ref, \
- int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- \
- *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, \
- ht, &diff); \
- \
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
-}
+#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t vpx_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, uint32_t *sse) { \
+ int32_t diff; \
+ \
+ *sse = \
+ sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
VPX_VARIANCE_WDXHT_MSA(4, 4);
VPX_VARIANCE_WDXHT_MSA(4, 8);
@@ -585,8 +584,7 @@ uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
}
uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
*sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
return *sse;
@@ -617,17 +615,15 @@ uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
}
void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse, int32_t *sum) {
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
*sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}
void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse, int32_t *sum) {
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
*sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}
-uint32_t vpx_get_mb_ss_msa(const int16_t *src) {
- return get_mb_ss_msa(src);
-}
+uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
diff --git a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
index f6244d834..ad2af2866 100644
--- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -13,8 +13,7 @@
#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@@ -48,8 +47,7 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
}
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@@ -92,10 +90,8 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
}
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
if (4 == height) {
common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -105,10 +101,8 @@ static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
}
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
int32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@@ -136,18 +130,16 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
- dst, dst_stride);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
}
}
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
int32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
@@ -199,11 +191,9 @@ static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
}
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
@@ -256,11 +246,9 @@ static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
}
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
@@ -318,8 +306,7 @@ static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, mask;
@@ -344,8 +331,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
@@ -378,10 +364,8 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
if (4 == height) {
common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -391,8 +375,7 @@ static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, mask;
@@ -412,16 +395,13 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
}
-static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+static void common_hz_2t_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, filt;
@@ -442,8 +422,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -452,8 +432,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
if (16 == height) {
@@ -467,8 +447,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
LD_SB4(src, src_stride, src0, src1, src2, src3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -477,16 +457,14 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
}
}
static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
if (4 == height) {
common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -497,11 +475,9 @@ static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
@@ -566,11 +542,9 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
@@ -617,11 +591,9 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
}
static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
@@ -662,8 +634,8 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int8_t cnt, filt_hor[8];
assert(x_step_q4 == 16);
@@ -676,67 +648,55 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_x)[0] == 0) {
switch (w) {
case 4:
- common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
+ common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
break;
case 8:
- common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
+ common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
break;
case 16:
- common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
+ common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
break;
case 32:
- common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
+ common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
break;
case 64:
- common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
+ common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
break;
default:
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
} else {
switch (w) {
case 4:
- common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
+ common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
break;
case 8:
- common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
+ common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
break;
case 16:
- common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
+ common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
break;
case 32:
- common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
+ common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
break;
case 64:
- common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
+ common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
break;
default:
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
index 2abde6de8..1cfa63201 100644
--- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -12,13 +12,9 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"
-static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
@@ -64,15 +60,15 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
@@ -94,13 +90,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
}
}
-static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
@@ -154,20 +146,20 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
@@ -180,8 +172,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
- CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
- dst, dst_stride);
+ CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
hz_out6 = hz_out10;
@@ -194,13 +186,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
}
}
-static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t multiple8_cnt;
for (multiple8_cnt = 2; multiple8_cnt--;) {
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@@ -210,13 +198,9 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
}
}
-static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t multiple8_cnt;
for (multiple8_cnt = 4; multiple8_cnt--;) {
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@@ -226,13 +210,9 @@ static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
}
}
-static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t multiple8_cnt;
for (multiple8_cnt = 8; multiple8_cnt--;) {
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@@ -242,12 +222,9 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
}
}
-static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, vec0, vec1;
v16u8 dst0, dst1, dst2, dst3, res0, res1;
@@ -280,12 +257,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
-static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -316,29 +290,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
- dst4, dst6);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+ dst6);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
- tmp0, tmp1, tmp2, tmp3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
+ tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
- res2, res3);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
- res2, res3);
+ PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
+ res3);
+ AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+ res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
-static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
if (4 == height) {
common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
filter_horiz, filter_vert);
@@ -348,12 +318,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
}
}
-static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
@@ -390,17 +357,13 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
tmp3 = __msa_dotp_u_h(vec3, filt_vt);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+ dst_stride);
}
-static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
@@ -445,36 +408,27 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
}
}
-static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
if (4 == height) {
common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
filter_horiz, filter_vert);
} else {
- common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert,
- height);
+ common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+ src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
}
}
-static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
@@ -536,13 +490,9 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
}
}
-static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t multiple8_cnt;
for (multiple8_cnt = 2; multiple8_cnt--;) {
common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
@@ -552,13 +502,9 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
}
}
-static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t multiple8_cnt;
for (multiple8_cnt = 4; multiple8_cnt--;) {
common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
@@ -571,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int8_t cnt, filt_hor[8], filt_ver[8];
assert(x_step_q4 == 16);
@@ -589,72 +535,69 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
((const int32_t *)filter_y)[0] == 0) {
switch (w) {
case 4:
- common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
+ common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
break;
case 8:
- common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
+ common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
break;
case 16:
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
&filt_hor[3], &filt_ver[3], h);
break;
case 32:
- common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
&filt_hor[3], &filt_ver[3], h);
break;
case 64:
- common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
&filt_hor[3], &filt_ver[3], h);
break;
default:
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
} else {
switch (w) {
case 4:
- common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
+ common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
break;
case 8:
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
break;
case 16:
- common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
+ common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
break;
case 32:
- common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
+ common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
break;
case 64:
- common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
+ common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
break;
default:
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
index 0164e41aa..146ce3b2f 100644
--- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -13,10 +13,8 @@
#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@@ -73,10 +71,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
}
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@@ -106,18 +102,18 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
XORI_B4_128_SB(src7, src8, src9, src10);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
- out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
+ out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
+ filt2, filt3);
+ out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
+ filt2, filt3);
+ out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
+ filt2, filt3);
out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
filt1, filt2, filt3);
SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
- dst, dst_stride);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+ dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@@ -130,13 +126,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
}
}
-static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height,
- int32_t width) {
+static void common_vt_8t_and_aver_dst_16w_mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height, int32_t width) {
const uint8_t *src_tmp;
uint8_t *dst_tmp;
uint32_t loop_cnt, cnt;
@@ -227,38 +219,31 @@ static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
}
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, 16);
}
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, 32);
}
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, 64);
}
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, src4;
@@ -292,8 +277,7 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -311,15 +295,15 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
src8 = LD_SB(src);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
- dst2, dst3);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
+ dst3);
ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
src32_r, src43_r);
ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
src76_r, src87_r);
- ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
- src87_r, src76_r, src2110, src4332, src6554, src8776);
+ ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@@ -331,10 +315,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
if (4 == height) {
common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -344,8 +326,7 @@ static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
v16u8 src0, src1, src2, src3, src4;
@@ -364,16 +345,13 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+ dst_stride);
}
-static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+static void common_vt_2t_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
@@ -393,22 +371,22 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
src += (8 * src_stride);
LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
- vec2, vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
- vec6, vec7);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
+ dst_stride);
dst += (4 * dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
- dst, dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
+ dst_stride);
dst += (4 * dst_stride);
src0 = src8;
@@ -416,10 +394,8 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
int32_t height) {
if (4 == height) {
common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -430,11 +406,9 @@ static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
@@ -481,11 +455,9 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -554,11 +526,9 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
}
static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5;
v16u8 src6, src7, src8, src9, src10, src11, filt0;
@@ -636,8 +606,8 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int8_t cnt, filt_ver[8];
assert(y_step_q4 == 16);
@@ -650,68 +620,56 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_y)[0] == 0) {
switch (w) {
case 4:
- common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
+ common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
break;
case 8:
- common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
+ common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
break;
case 16:
- common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
+ common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
break;
case 32:
- common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
+ common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
break;
case 64:
- common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
+ common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
} else {
switch (w) {
case 4:
- common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
+ common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
break;
case 8:
- common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
+ common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
break;
case 16:
- common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
+ common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
break;
case 32:
- common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
+ common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
break;
case 64:
- common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
+ common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
index dbd120b0d..9e8bf7b51 100644
--- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -325,7 +325,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
@@ -347,7 +347,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
@@ -355,8 +355,8 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
- res2, res3);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
@@ -383,7 +383,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -406,7 +406,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
@@ -482,7 +482,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
@@ -545,7 +545,7 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
for (loop_cnt = height >> 1; loop_cnt--;) {
src0 = LD_SB(src);
@@ -590,7 +590,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */
filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
for (loop_cnt = height; loop_cnt--;) {
src0 = LD_SB(src);
@@ -622,8 +622,8 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int8_t cnt, filt_hor[8];
assert(x_step_q4 == 16);
@@ -636,67 +636,55 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_x)[0] == 0) {
switch (w) {
case 4:
- common_hz_2t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_hor[3], h);
break;
case 8:
- common_hz_2t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_hor[3], h);
break;
case 16:
- common_hz_2t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_hor[3], h);
break;
case 32:
- common_hz_2t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_hor[3], h);
break;
case 64:
- common_hz_2t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_hor[3], h);
break;
default:
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
} else {
switch (w) {
case 4:
- common_hz_8t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_hor, h);
break;
case 8:
- common_hz_8t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_hor, h);
break;
case 16:
- common_hz_8t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_hor, h);
break;
case 32:
- common_hz_8t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_hor, h);
break;
case 64:
- common_hz_8t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_hor, h);
break;
default:
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/vpx_convolve8_msa.c b/vpx_dsp/mips/vpx_convolve8_msa.c
index 7546f1315..b16ec5788 100644
--- a/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -69,15 +69,15 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride);
- hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
@@ -151,20 +151,20 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
XORI_B4_128_SB(src7, src8, src9, src10);
- hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
@@ -295,11 +295,11 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
- vec4, vec5, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
+ vec5, vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
- res2, res3);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
@@ -361,12 +361,10 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
}
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
+ int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
+ int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
v16u8 filt_hz, filt_vt, vec0;
@@ -542,11 +540,10 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
}
}
-void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t x_step_q4,
- const int16_t *filter_y, int32_t y_step_q4,
- int32_t w, int32_t h) {
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int32_t x_step_q4, const int16_t *filter_y,
+ int32_t y_step_q4, int32_t w, int32_t h) {
int8_t cnt, filt_hor[8], filt_ver[8];
assert(x_step_q4 == 16);
@@ -563,72 +560,69 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
((const int32_t *)filter_y)[0] == 0) {
switch (w) {
case 4:
- common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
+ common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
break;
case 8:
- common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
+ common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
break;
case 16:
- common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
+ common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
break;
case 32:
- common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
+ common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
break;
case 64:
- common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
+ common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
break;
default:
- vpx_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
} else {
switch (w) {
case 4:
- common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
+ common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
break;
case 8:
- common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
+ common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
break;
case 16:
- common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
+ common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
break;
case 32:
- common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
+ common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
break;
case 64:
- common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
+ common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
break;
default:
- vpx_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
index 527d45719..410682271 100644
--- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -222,11 +222,11 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
- src32_r, src54_r, src21_r);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
- src32_l, src54_l, src21_l);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
@@ -344,8 +344,8 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
src32_r, src43_r);
ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
src76_r, src87_r);
- ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
- src87_r, src76_r, src2110, src4332, src6554, src8776);
+ ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@@ -407,10 +407,10 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
src += (8 * src_stride);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
- vec2, vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
- vec6, vec7);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@@ -629,8 +629,8 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
int8_t cnt, filt_ver[8];
assert(y_step_q4 == 16);
@@ -643,67 +643,55 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_y)[0] == 0) {
switch (w) {
case 4:
- common_vt_2t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_ver[3], h);
break;
case 8:
- common_vt_2t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_ver[3], h);
break;
case 16:
- common_vt_2t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_ver[3], h);
break;
case 32:
- common_vt_2t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_ver[3], h);
break;
case 64:
- common_vt_2t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
&filt_ver[3], h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
} else {
switch (w) {
case 4:
- common_vt_8t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_ver, h);
break;
case 8:
- common_vt_8t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_ver, h);
break;
case 16:
- common_vt_8t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_ver, h);
break;
case 32:
- common_vt_8t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_ver, h);
break;
case 64:
- common_vt_8t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
+ common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
filt_ver, h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
break;
}
}
diff --git a/vpx_dsp/mips/vpx_convolve_avg_msa.c b/vpx_dsp/mips/vpx_convolve_avg_msa.c
index 4c3d97803..45399bad8 100644
--- a/vpx_dsp/mips/vpx_convolve_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -10,8 +10,8 @@
#include "vpx_dsp/mips/macros_msa.h"
-static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
int32_t cnt;
uint32_t out0, out1, out2, out3;
v16u8 src0, src1, src2, src3;
@@ -24,8 +24,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
out0 = __msa_copy_u_w((v4i32)dst0, 0);
out1 = __msa_copy_u_w((v4i32)dst1, 0);
@@ -53,8 +53,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
}
}
-static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
int32_t cnt;
uint64_t out0, out1, out2, out3;
v16u8 src0, src1, src2, src3;
@@ -65,8 +65,8 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
out0 = __msa_copy_u_d((v2i64)dst0, 0);
out1 = __msa_copy_u_d((v2i64)dst1, 0);
@@ -88,10 +88,10 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
src += (8 * src_stride);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
- dst4, dst5, dst6, dst7);
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
dst += (8 * dst_stride);
}
@@ -120,14 +120,14 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
dst_dup += (4 * dst_stride);
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
- dst4, dst5, dst6, dst7);
- AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
- dst8, dst9, dst10, dst11);
- AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
- dst12, dst13, dst14, dst15);
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+ dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+ dst13, dst14, dst15);
ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
@@ -166,14 +166,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
dst_dup += dst_stride;
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
- dst4, dst5, dst6, dst7);
- AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
- dst8, dst9, dst10, dst11);
- AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
- dst12, dst13, dst14, dst15);
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+ dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+ dst13, dst14, dst15);
ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
dst += dst_stride;
diff --git a/vpx_dsp/mips/vpx_convolve_copy_msa.c b/vpx_dsp/mips/vpx_convolve_copy_msa.c
index ba4012281..c3d87a4ab 100644
--- a/vpx_dsp/mips/vpx_convolve_copy_msa.c
+++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -105,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
dst_tmp = dst;
for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src_tmp, src_stride,
- src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
src_tmp += (8 * src_stride);
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
- dst_tmp, dst_stride);
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+ dst_stride);
dst_tmp += (8 * dst_stride);
}
diff --git a/vpx_dsp/mips/vpx_convolve_msa.h b/vpx_dsp/mips/vpx_convolve_msa.h
index e0013983a..198c21ed2 100644
--- a/vpx_dsp/mips/vpx_convolve_msa.h
+++ b/vpx_dsp/mips/vpx_convolve_msa.h
@@ -16,104 +16,109 @@
extern const uint8_t mc_filt_mask_arr[16 * 3];
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
- filt0, filt1, filt2, filt3) ({ \
- v8i16 tmp0, tmp1; \
- \
- tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
- tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
- tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
- tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
- tmp0 = __msa_adds_s_h(tmp0, tmp1); \
- \
- tmp0; \
-})
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
+ filt3) \
+ ({ \
+ v8i16 tmp0, tmp1; \
+ \
+ tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
+ tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
+ tmp0 = __msa_adds_s_h(tmp0, tmp1); \
+ \
+ tmp0; \
+ })
-#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
- filt_h0, filt_h1, filt_h2, filt_h3) ({ \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
- v8i16 hz_out_m; \
- \
- VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
- vec0_m, vec1_m, vec2_m, vec3_m); \
- hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
- filt_h0, filt_h1, filt_h2, filt_h3); \
- \
- hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
- hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
- \
- hz_out_m; \
-})
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \
+ filt_h1, filt_h2, filt_h3) \
+ ({ \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 hz_out_m; \
+ \
+ VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
+ vec3_m); \
+ hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \
+ filt_h1, filt_h2, filt_h3); \
+ \
+ hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
+ hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
+ \
+ hz_out_m; \
+ })
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
- mask0, mask1, mask2, mask3, \
- filt0, filt1, filt2, filt3, \
- out0, out1) { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
- DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
- DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
- DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
- ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
-}
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
+ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
+ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
+ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
+ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
+ }
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
- mask0, mask1, mask2, mask3, \
- filt0, filt1, filt2, filt3, \
- out0, out1, out2, out3) { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
- \
- VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
- res4_m, res5_m, res6_m, res7_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
- res4_m, res5_m, res6_m, res7_m); \
- ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
- res7_m, out0, out1, out2, out3); \
-}
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
+ res4_m, res5_m, res6_m, res7_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
+ res4_m, res5_m, res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+ }
-#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \
- v16u8 tmp_m; \
- \
- tmp_m = PCKEV_XORI128_UB(in1, in0); \
- tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
- ST_UB(tmp_m, (pdst)); \
-}
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
+ { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = PCKEV_XORI128_UB(in1, in0); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+ }
-#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \
- v16u8 tmp_m; \
- \
- tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
- ST_UB(tmp_m, (pdst)); \
-}
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
+ { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+ }
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
- pdst, stride) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- \
- PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
- PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
-}
-#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
+ stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ }
+#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
diff --git a/vpx_dsp/prob.c b/vpx_dsp/prob.c
index 639d24dd2..819e95062 100644
--- a/vpx_dsp/prob.c
+++ b/vpx_dsp/prob.c
@@ -11,22 +11,16 @@
#include "./prob.h"
const uint8_t vpx_norm[256] = {
- 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static unsigned int tree_merge_probs_impl(unsigned int i,
@@ -35,13 +29,13 @@ static unsigned int tree_merge_probs_impl(unsigned int i,
const unsigned int *counts,
vpx_prob *probs) {
const int l = tree[i];
- const unsigned int left_count = (l <= 0)
- ? counts[-l]
- : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
+ const unsigned int left_count =
+ (l <= 0) ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
const int r = tree[i + 1];
- const unsigned int right_count = (r <= 0)
- ? counts[-r]
- : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
+ const unsigned int right_count =
+ (r <= 0) ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
const unsigned int ct[2] = { left_count, right_count };
probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
return left_count + right_count;
diff --git a/vpx_dsp/prob.h b/vpx_dsp/prob.h
index c3cb103ff..148116ed0 100644
--- a/vpx_dsp/prob.h
+++ b/vpx_dsp/prob.h
@@ -24,11 +24,11 @@ typedef uint8_t vpx_prob;
#define MAX_PROB 255
-#define vpx_prob_half ((vpx_prob) 128)
+#define vpx_prob_half ((vpx_prob)128)
typedef int8_t vpx_tree_index;
-#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2)
#define vpx_complement(x) (255 - x)
@@ -60,8 +60,7 @@ static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
-static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
- const unsigned int ct[2],
+static INLINE vpx_prob merge_probs(vpx_prob pre_prob, const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
@@ -72,7 +71,7 @@ static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
- 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+ 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
70, 76, 83, 89, 96, 102, 108, 115, 121, 128
};
@@ -93,7 +92,6 @@ static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
const unsigned int *counts, vpx_prob *probs);
-
DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
#ifdef __cplusplus
diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c
index 5bf786271..47afd4388 100644
--- a/vpx_dsp/psnr.c
+++ b/vpx_dsp/psnr.c
@@ -14,7 +14,6 @@
#include "vpx_dsp/psnr.h"
#include "vpx_scale/yv12config.h"
-
double vpx_sse_to_psnr(double samples, double peak, double sse) {
if (sse > 0.0) {
const double psnr = 10.0 * log10(samples * peak * peak / sse);
@@ -27,9 +26,9 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
* and highbd_8_variance(). It should not.
*/
-static void encoder_variance(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, unsigned int *sse,
+ int *sum) {
int i, j;
*sum = 0;
@@ -48,10 +47,9 @@ static void encoder_variance(const uint8_t *a, int a_stride,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint64_t *sse,
- int64_t *sum) {
+static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h, uint64_t *sse, int64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -70,22 +68,20 @@ static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
}
}
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum) {
+static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
int64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
- &sse_long, &sum_long);
+ encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
+ &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-static int64_t get_sse(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
+static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
@@ -94,15 +90,15 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
int x, y;
if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
- dw, height, &sse, &sum);
+ encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
+ height, &sse, &sum);
total_sse += sse;
}
if (dh > 0) {
encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh,
+ &sse, &sum);
total_sse += sse;
}
@@ -126,9 +122,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
#if CONFIG_VP9_HIGHBITDEPTH
static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int width, int height,
- unsigned int input_shift) {
+ const uint8_t *b8, int b_stride, int width,
+ int height, unsigned int input_shift) {
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
int64_t total_sse = 0;
@@ -145,9 +140,8 @@ static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
return total_sse;
}
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
int64_t total_sse = 0;
int x, y;
const int dw = width % 16;
@@ -155,15 +149,14 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
unsigned int sse = 0;
int sum = 0;
if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride,
- &b[width - dw], b_stride,
- dw, height, &sse, &sum);
+ encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height, &sse, &sum);
total_sse += sse;
}
if (dh > 0) {
encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
total_sse += sse;
}
for (y = 0; y < height / 16; ++y) {
@@ -182,38 +175,35 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-
int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
+ const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
assert(a->y_crop_height == b->y_crop_height);
return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
+ a->y_crop_width, a->y_crop_height);
}
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
+ const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
assert(a->y_crop_height == b->y_crop_height);
assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
+ a->y_crop_width, a->y_crop_height);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_HIGHBITDEPTH
void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr, uint32_t bit_depth,
- uint32_t in_bit_depth) {
- const int widths[3] =
- { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
- const int heights[3] =
- { a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ uint32_t bit_depth, uint32_t in_bit_depth) {
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
@@ -231,17 +221,14 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
uint64_t sse;
if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
if (input_shift) {
- sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i], w, h,
- input_shift);
+ sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i],
+ b_strides[i], w, h, input_shift);
} else {
- sse = highbd_get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i], w, h);
+ sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i],
+ b_strides[i], w, h);
}
} else {
- sse = get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
+ sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
}
psnr->sse[1 + i] = sse;
psnr->samples[1 + i] = samples;
@@ -253,8 +240,8 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
psnr->sse[0] = total_sse;
psnr->samples[0] = total_samples;
- psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
- (double)total_sse);
+ psnr->psnr[0] =
+ vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse);
}
#endif // !CONFIG_VP9_HIGHBITDEPTH
@@ -262,10 +249,9 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr) {
static const double peak = 255.0;
- const int widths[3] = {
- a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
- const int heights[3] = {
- a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
@@ -278,9 +264,8 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
const int w = widths[i];
const int h = heights[i];
const uint32_t samples = w * h;
- const uint64_t sse = get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
+ const uint64_t sse =
+ get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
psnr->sse[1 + i] = sse;
psnr->samples[1 + i] = samples;
psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
@@ -291,6 +276,6 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
psnr->sse[0] = total_sse;
psnr->samples[0] = total_samples;
- psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
- (double)total_sse);
+ psnr->psnr[0] =
+ vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse);
}
diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h
index e25b4504a..f321131d0 100644
--- a/vpx_dsp/psnr.h
+++ b/vpx_dsp/psnr.h
@@ -11,7 +11,6 @@
#ifndef VPX_DSP_PSNR_H_
#define VPX_DSP_PSNR_H_
-
#include "vpx_scale/yv12config.h"
#define MAX_PSNR 100.0
@@ -37,25 +36,20 @@ typedef struct {
* \param[in] sse Sum of squared errors
*/
double vpx_sse_to_psnr(double samples, double peak, double sse);
-int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b);
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b);
void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr,
- unsigned int bit_depth,
- unsigned int in_bit_depth);
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ unsigned int bit_depth, unsigned int in_bit_depth);
#endif
-void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr);
+void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *phvs_y, double *phvs_u,
- double *phvs_v, uint32_t bd, uint32_t in_bd);
+ const YV12_BUFFER_CONFIG *dest, double *phvs_y,
+ double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
#ifdef __cplusplus
} // extern "C"
diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c
index 3708cc3c8..b3910152c 100644
--- a/vpx_dsp/psnrhvs.c
+++ b/vpx_dsp/psnrhvs.c
@@ -22,28 +22,28 @@
#include "vpx_dsp/psnr.h"
#if !defined(M_PI)
-# define M_PI (3.141592653589793238462643)
+#define M_PI (3.141592653589793238462643)
#endif
#include <string.h>
static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
int xstride) {
int i, j;
- (void) xstride;
+ (void)xstride;
vpx_fdct8x8(x, y, ystride);
for (i = 0; i < 8; i++)
- for (j = 0; j< 8; j++)
- *(y + ystride*i + j) = (*(y + ystride*i + j) + 4) >> 3;
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
}
#if CONFIG_VP9_HIGHBITDEPTH
static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
- int xstride) {
+ int xstride) {
int i, j;
- (void) xstride;
+ (void)xstride;
vpx_highbd_fdct8x8(x, y, ystride);
for (i = 0; i < 8; i++)
- for (j = 0; j< 8; j++)
- *(y + ystride*i + j) = (*(y + ystride*i + j) + 4) >> 3;
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
}
#endif
@@ -51,56 +51,59 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
* transparency. This is not the JPEG based matrix from the paper,
this one gives a slightly higher MOS agreement.*/
static const double csf_y[8][8] = {
- {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
- 0.678296995242, 0.466224900598, 0.3265091542},
- {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
- 0.868920337363, 0.61280991668, 0.436405793551},
- {2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
- 0.670882927016, 0.501731932449, 0.372504254596},
- {1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575,
- 0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565},
- {1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554,
- 0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204},
- {0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
- 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321},
- {0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
- 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001},
- {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
- 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}};
+ { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+ 0.678296995242, 0.466224900598, 0.3265091542 },
+ { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+ 0.868920337363, 0.61280991668, 0.436405793551 },
+ { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+ 0.670882927016, 0.501731932449, 0.372504254596 },
+ { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
+ 0.48309405692, 0.380429446972, 0.295774038565 },
+ { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
+ 0.352889268808, 0.283006984131, 0.226951348204 },
+ { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+ 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
+ { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+ 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
+ { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+ 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
+};
static const double csf_cb420[8][8] = {
- {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
- 0.898018824055, 0.74725392039, 0.615105596242},
- {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
- 1.17428548929, 0.996404342439, 0.830890433625},
- {1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
- 0.960060382087, 0.849823426169, 0.731221236837},
- {1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629,
- 0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374},
- {1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099,
- 0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034},
- {0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
- 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965},
- {0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
- 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733},
- {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
- 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}};
+ { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242 },
+ { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625 },
+ { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837 },
+ { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374 },
+ { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034 },
+ { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+ 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
+ { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+ 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
+ { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+ 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
+};
static const double csf_cr420[8][8] = {
- {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
- 0.867069376285, 0.721500455585, 0.593906509971},
- {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
- 1.13381474809, 0.962064122248, 0.802254508198},
- {1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
- 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706},
- {1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
- 0.725539939514, 0.661776842059, 0.587716619023},
- {1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195,
- 0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273},
- {0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
- 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543},
- {0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
- 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063},
- {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
- 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}};
+ { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971 },
+ { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198 },
+ { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+ 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
+ { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023 },
+ { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273 },
+ { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+ 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
+ { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+ 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
+ { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+ 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
+};
static double convert_score_db(double _score, double _weight, int bit_depth) {
int16_t pix_max = 255;
@@ -110,16 +113,14 @@ static double convert_score_db(double _score, double _weight, int bit_depth) {
else if (bit_depth == 12)
pix_max = 4095;
- if (_weight * _score < pix_max * pix_max * 1e-10)
- return MAX_PSNR;
+ if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
}
static double calc_psnrhvs(const unsigned char *src, int _systride,
- const unsigned char *dst, int _dystride,
- double _par, int _w, int _h, int _step,
- const double _csf[8][8], uint32_t bit_depth,
- uint32_t _shift) {
+ const unsigned char *dst, int _dystride, double _par,
+ int _w, int _h, int _step, const double _csf[8][8],
+ uint32_t bit_depth, uint32_t _shift) {
double ret;
const uint8_t *_src8 = src;
const uint8_t *_dst8 = dst;
@@ -131,7 +132,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
int pixels;
int x;
int y;
- (void) _par;
+ (void)_par;
ret = pixels = 0;
/*In the PSNR-HVS-M paper[1] the authors describe the construction of
@@ -152,8 +153,8 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
for (x = 0; x < 8; x++)
for (y = 0; y < 8; y++)
- mask[x][y] = (_csf[x][y] * 0.3885746225901003)
- * (_csf[x][y] * 0.3885746225901003);
+ mask[x][y] =
+ (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
for (y = 0; y < _h - 7; y += _step) {
for (x = 0; x < _w - 7; x += _step) {
int i;
@@ -188,27 +189,23 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
}
s_gmean /= 64.f;
d_gmean /= 64.f;
- for (i = 0; i < 4; i++)
- s_means[i] /= 16.f;
- for (i = 0; i < 4; i++)
- d_means[i] /= 16.f;
+ for (i = 0; i < 4; i++) s_means[i] /= 16.f;
+ for (i = 0; i < 4; i++) d_means[i] /= 16.f;
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
- s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub])
- * (dct_s[i * 8 + j] - s_means[sub]);
- d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub])
- * (dct_d[i * 8 + j] - d_means[sub]);
+ s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
+ (dct_s[i * 8 + j] - s_means[sub]);
+ d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
+ (dct_d[i * 8 + j] - d_means[sub]);
}
}
s_gvar *= 1 / 63.f * 64;
d_gvar *= 1 / 63.f * 64;
- for (i = 0; i < 4; i++)
- s_vars[i] *= 1 / 15.f * 16;
- for (i = 0; i < 4; i++)
- d_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
if (s_gvar > 0)
s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
if (d_gvar > 0)
@@ -231,8 +228,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
s_mask = sqrt(s_mask * s_gvar) / 32.f;
d_mask = sqrt(d_mask * d_gvar) / 32.f;
- if (d_mask > s_mask)
- s_mask = d_mask;
+ if (d_mask > s_mask) s_mask = d_mask;
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
double err;
@@ -245,16 +241,15 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
}
}
}
- if (pixels <=0)
- return 0;
+ if (pixels <= 0) return 0;
ret /= pixels;
return ret;
}
double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src,
const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs,
- double *u_psnrhvs, double *v_psnrhvs,
- uint32_t bd, uint32_t in_bd) {
+ double *u_psnrhvs, double *v_psnrhvs, uint32_t bd,
+ uint32_t in_bd) {
double psnrhvs;
const double par = 1.0;
const int step = 7;
@@ -268,17 +263,13 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src,
*y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dest->y_buffer,
dest->y_stride, par, src->y_crop_width,
- src->y_crop_height, step, csf_y, bd,
- bd_shift);
+ src->y_crop_height, step, csf_y, bd, bd_shift);
*u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dest->u_buffer,
dest->uv_stride, par, src->uv_crop_width,
- src->uv_crop_height, step, csf_cb420, bd,
- bd_shift);
+ src->uv_crop_height, step, csf_cb420, bd, bd_shift);
*v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dest->v_buffer,
dest->uv_stride, par, src->uv_crop_width,
- src->uv_crop_height, step, csf_cr420, bd,
- bd_shift);
+ src->uv_crop_height, step, csf_cr420, bd, bd_shift);
psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
return convert_score_db(psnrhvs, 1.0, in_bd);
}
-
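The score-to-dB step reformatted above reduces to comparing a weighted, masked error against the squared peak pixel value; a self-contained sketch of that arithmetic (the helper name is illustrative and the 100 dB cap stands in for MAX_PSNR, which is an assumption here):

#include <math.h>
#include <stdio.h>

/* Weighted masked error -> dB, following the shape of convert_score_db(). */
static double score_to_db(double score, double weight, int pix_max) {
  const double cap = 100.0; /* stand-in for MAX_PSNR (assumption) */
  if (weight * score < (double)pix_max * pix_max * 1e-10) return cap;
  return 10.0 * (log10((double)pix_max * pix_max) - log10(weight * score));
}

int main(void) {
  /* An 8-bit score of 1.0 with unit weight maps to ~48.13 dB. */
  printf("%.2f dB\n", score_to_db(1.0, 1.0, 255));
  return 0;
}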
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 80fcd66b0..3c7f9832f 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,8 +12,7 @@
#include "vpx_dsp/quantize.h"
#include "vpx_mem/vpx_mem.h"
-void vpx_quantize_dc(const tran_low_t *coeff_ptr,
- int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
@@ -29,20 +28,19 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr,
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 16;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
- if (tmp)
- eob = 0;
+ if (tmp) eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
- int n_coeffs, int skip_block,
- const int16_t *round_ptr, const int16_t quant,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr) {
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr) {
int eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -56,8 +54,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
const int abs_qcoeff = (int)((tmp * quant) >> 16);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
- if (abs_qcoeff)
- eob = 0;
+ if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
}
@@ -81,19 +78,16 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
- if (tmp)
- eob = 0;
+ if (tmp) eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
- int skip_block,
- const int16_t *round_ptr,
- const int16_t quant,
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
@@ -112,24 +106,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
const int abs_qcoeff = (int)((tmp * quant) >> 15);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
- if (abs_qcoeff)
- eob = 0;
+ if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
- const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -158,12 +150,12 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
if (abs_coeff >= zbins[rc != 0]) {
int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >> 16; // quantization
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ quant_shift_ptr[rc != 0]) >>
+ 16; // quantization
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
- if (tmp)
- eob = i;
+ if (tmp) eob = i;
}
}
}
@@ -176,12 +168,11 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
- const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -214,8 +205,7 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
- if (abs_qcoeff)
- eob = i;
+ if (abs_qcoeff) eob = i;
}
}
}
@@ -224,17 +214,15 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
#endif
void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
- const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
@@ -267,33 +255,28 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
- quant_shift_ptr[rc != 0]) >> 15;
+ quant_shift_ptr[rc != 0]) >>
+ 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (tmp)
- eob = idx_arr[i];
+ if (tmp) eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
- const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+void vpx_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
@@ -322,15 +305,14 @@ void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 = abs_coeff
- + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff)
- eob = idx_arr[i];
+ if (abs_qcoeff) eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
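The DC quantizer hunks above all follow the same round, Q16-scale, restore-sign, dequantize pattern; a self-contained sketch with made-up round/quant/dequant values (the clamp step of the real code is omitted for brevity, and the function name is illustrative):

#include <stdint.h>
#include <stdio.h>

/* Round, scale by a Q16 multiplier, restore the sign, then dequantize --
 * the arithmetic pattern of vpx_quantize_dc(), with invented table values. */
static void quantize_dc_sketch(int16_t coeff, int16_t round, int16_t quant,
                               int16_t dequant, int16_t *qcoeff,
                               int16_t *dqcoeff) {
  const int coeff_sign = coeff >> 15; /* 0 or -1 (arithmetic shift, as in the libvpx sources) */
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  const int tmp = ((abs_coeff + round) * quant) >> 16;
  *qcoeff = (int16_t)((tmp ^ coeff_sign) - coeff_sign);
  *dqcoeff = (int16_t)(*qcoeff * dequant);
}

int main(void) {
  int16_t q, dq;
  quantize_dc_sketch(-1234, 8, 16384 /* ~0.25 in Q16 */, 4, &q, &dq);
  printf("qcoeff=%d dqcoeff=%d\n", q, dq); /* qcoeff=-310 dqcoeff=-1240 */
  return 0;
}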
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
index 89ec59792..e13284546 100644
--- a/vpx_dsp/quantize.h
+++ b/vpx_dsp/quantize.h
@@ -18,8 +18,7 @@
extern "C" {
#endif
-void vpx_quantize_dc(const tran_low_t *coeff_ptr,
- int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
@@ -29,19 +28,17 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t dequant_ptr, uint16_t *eob_ptr);
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
- int n_coeffs, int skip_block,
- const int16_t *round_ptr, const int16_t quant_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
- int skip_block,
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr,
const int16_t quant_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr,
- uint16_t *eob_ptr);
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
#endif
#ifdef __cplusplus
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index f1f951f14..c80ef729b 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -17,15 +17,13 @@
#include "vpx_ports/mem.h"
/* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
+static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- sad += abs(a[x] - b[x]);
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
@@ -33,40 +31,43 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
-#define sadMxN(m, n) \
-unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- return sad(src, src_stride, ref, ref_stride, m, n); \
-} \
-unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- uint8_t comp_pred[m * n]; \
- vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
- return sad(src, src_stride, comp_pred, m, m, n); \
-}
+#define sadMxN(m, n) \
+ unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint8_t comp_pred[m * n]; \
+ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ }
// depending on call sites, pass **ref_array to avoid & in subsequent call and
// de-dup with 4D below.
-#define sadMxNxK(m, n, k) \
-void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref_array, int ref_stride, \
- uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < k; ++i) \
- sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
-}
+#define sadMxNxK(m, n, k) \
+ void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < k; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+ }
// This appears to be equivalent to the above when k == 4 and refs is const
-#define sadMxNx4D(m, n) \
-void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], int ref_stride, \
- uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) \
- sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
-}
+#define sadMxNx4D(m, n) \
+ void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+ }
+/* clang-format off */
// 64x64
sadMxN(64, 64)
sadMxNxK(64, 64, 3)
@@ -134,18 +135,18 @@ sadMxN(4, 4)
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
+/* clang-format on */
#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int width, int height) {
+ static INLINE
+ unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- sad += abs(a[x] - b[x]);
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
@@ -160,8 +161,7 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- sad += abs(a[x] - b[x]);
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
@@ -169,43 +169,43 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
return sad;
}
-#define highbd_sadMxN(m, n) \
-unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
-} \
-unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- const uint8_t *second_pred) { \
- uint16_t comp_pred[m * n]; \
- vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
- return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
-}
+#define highbd_sadMxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+ }
-#define highbd_sadMxNxK(m, n, k) \
-void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref_array, int ref_stride, \
- uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < k; ++i) { \
- sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
- ref_stride); \
- } \
-}
+#define highbd_sadMxNxK(m, n, k) \
+ void vpx_highbd_sad##m##x##n##x##k##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref_array, \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < k; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \
+ &ref_array[i], ref_stride); \
+ } \
+ }
-#define highbd_sadMxNx4D(m, n) \
-void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) { \
- sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
- ref_stride); \
- } \
-}
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ }
+/* clang-format off */
// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxNxK(64, 64, 3)
@@ -273,5 +273,6 @@ highbd_sadMxN(4, 4)
highbd_sadMxNxK(4, 4, 3)
highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
+/* clang-format on */
#endif // CONFIG_VP9_HIGHBITDEPTH
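The sadMxN/sadMxNxK/sadMxNx4D macros above only change layout; each generated function computes a plain sum of absolute differences over an m-by-n window. A tiny standalone illustration (function name and sample data are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal SAD over a width x height window, the operation the generated
 * vpx_sadMxN_c() functions perform. */
static unsigned int sad_sketch(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride, int width,
                               int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}

int main(void) {
  const uint8_t src[4] = { 10, 20, 30, 40 };
  const uint8_t ref[4] = { 12, 18, 30, 44 };
  printf("SAD = %u\n", sad_sketch(src, 2, ref, 2, 2, 2)); /* 2+2+0+4 = 8 */
  return 0;
}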
diff --git a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c
index 632e272dc..7a29bd29f 100644
--- a/vpx_dsp/ssim.c
+++ b/vpx_dsp/ssim.c
@@ -15,8 +15,8 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
-void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r,
- int rp, uint32_t *sum_s, uint32_t *sum_r,
+void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
uint32_t *sum_sq_s, uint32_t *sum_sq_r,
uint32_t *sum_sxr) {
int i, j;
@@ -31,9 +31,8 @@ void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r,
}
}
void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
- uint32_t *sum_s, uint32_t *sum_r,
- uint32_t *sum_sq_s, uint32_t *sum_sq_r,
- uint32_t *sum_sxr) {
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
int i, j;
for (i = 0; i < 8; i++, s += sp, r += rp) {
for (j = 0; j < 8; j++) {
@@ -47,9 +46,8 @@ void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp,
- const uint16_t *r, int rp,
- uint32_t *sum_s, uint32_t *sum_r,
+void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t *sum_s, uint32_t *sum_r,
uint32_t *sum_sq_s, uint32_t *sum_sq_r,
uint32_t *sum_sxr) {
int i, j;
@@ -65,16 +63,15 @@ void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-static const int64_t cc1 = 26634; // (64^2*(.01*255)^2
-static const int64_t cc2 = 239708; // (64^2*(.03*255)^2
-static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2
-static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2
-static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2
+static const int64_t cc1 = 26634; // (64^2*(.01*255)^2
+static const int64_t cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2
+static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2
+static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2
static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2
-static double similarity(uint32_t sum_s, uint32_t sum_r,
- uint32_t sum_sq_s, uint32_t sum_sq_r,
- uint32_t sum_sxr, int count,
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count,
uint32_t bd) {
int64_t ssim_n, ssim_d;
int64_t c1, c2;
@@ -93,12 +90,12 @@ static double similarity(uint32_t sum_s, uint32_t sum_r,
assert(0);
}
- ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
- (int64_t) 2 * sum_s * sum_r + c2);
+ ssim_n = (2 * sum_s * sum_r + c1) *
+ ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
- (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
+ (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
return ssim_n * 1.0 / ssim_d;
}
@@ -116,12 +113,8 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
&sum_sxr);
- return similarity(sum_s >> shift,
- sum_r >> shift,
- sum_sq_s >> (2 * shift),
- sum_sq_r >> (2 * shift),
- sum_sxr >> (2 * shift),
- 64, bd);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -161,8 +154,8 @@ static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
for (j = 0; j <= width - 8; j += 4) {
double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
- CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
- bd, shift);
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+ shift);
ssim_total += v;
samples++;
}
@@ -173,22 +166,18 @@ static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
#endif // CONFIG_VP9_HIGHBITDEPTH
double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *weight) {
+ const YV12_BUFFER_CONFIG *dest, double *weight) {
double a, b, c;
double ssimv;
- a = vpx_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride,
- source->y_crop_width, source->y_crop_height);
+ a = vpx_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width, source->y_crop_height);
- b = vpx_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride,
- source->uv_crop_width, source->uv_crop_height);
+ b = vpx_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
- c = vpx_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride,
- source->uv_crop_width, source->uv_crop_height);
+ c = vpx_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
ssimv = a * .8 + .1 * (b + c);
@@ -232,13 +221,13 @@ static double ssimv_similarity(const Ssimv *sv, int64_t n) {
const int64_t c2 = (cc2 * n * n) >> 12;
const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
- (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
// Since these variables are unsigned sums, convert to double so
// math is done in double arithmetic.
- const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
- / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r
- - sv->sum_r * sv->sum_r + c2);
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
return l * v;
}
@@ -267,24 +256,21 @@ static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
// Since these variables are unsigned, sums convert to double so
// math is done in double arithmetic.
- const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
- / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
- n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
return l * v;
}
static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
int img2_pitch, Ssimv *sv) {
- vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
- &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r,
- &sv->sum_sxr);
+ vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
}
-double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
- uint8_t *img2, int img2_pitch,
- int width, int height,
- Ssimv *sv2, Metrics *m,
- int do_inconsistency) {
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
double dssim_total = 0;
double ssim_total = 0;
double ssim2_total = 0;
@@ -295,10 +281,10 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
double old_ssim_total = 0;
vpx_clear_system_state();
// We can sample points as frequently as we like start with 1 per 4x4.
- for (i = 0; i < height; i += 4,
- img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
for (j = 0; j < width; j += 4, ++c) {
- Ssimv sv = {0};
+ Ssimv sv = { 0 };
double ssim;
double ssim2;
double dssim;
@@ -384,27 +370,29 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
// This measures how much consistent variance is in two consecutive
// source frames. 1.0 means they have exactly the same variance.
- const double variance_term = (2.0 * var_old * var_new + c1) /
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
(1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
// This measures how consistent the local mean are between two
// consecutive frames. 1.0 means they have exactly the same mean.
- const double mean_term = (2.0 * mean_old * mean_new + c2) /
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
(1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
// This measures how consistent the ssims of two
// consecutive frames is. 1.0 means they are exactly the same.
- double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) /
- (ssim_old * ssim_old + ssim_new * ssim_new + c3),
- 5);
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
double this_inconsistency;
// Floating point math sometimes makes this > 1 by a tiny bit.
// We want the metric to scale between 0 and 1.0 so we can convert
// it to an snr scaled value.
- if (ssim_term > 1)
- ssim_term = 1;
+ if (ssim_term > 1) ssim_term = 1;
// This converts the consistency metric to an inconsistency metric
// ( so we can scale it like psnr to something like sum square error.
@@ -432,8 +420,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
ssim2_total *= norm;
m->ssim2 = ssim2_total;
m->ssim = ssim_total;
- if (old_ssim_total == 0)
- inconsistency_total = 0;
+ if (old_ssim_total == 0) inconsistency_total = 0;
m->ssimc = inconsistency_total;
@@ -441,11 +428,10 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
return inconsistency_total;
}
-
#if CONFIG_VP9_HIGHBITDEPTH
double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *weight, uint32_t bd, uint32_t in_bd) {
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd) {
double a, b, c;
double ssimv;
uint32_t shift = 0;
@@ -453,20 +439,17 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
assert(bd >= in_bd);
shift = bd - in_bd;
- a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride,
- source->y_crop_width, source->y_crop_height,
- in_bd, shift);
+ a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, shift);
- b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride,
- source->uv_crop_width, source->uv_crop_height,
- in_bd, shift);
+ b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
- c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride,
- source->uv_crop_width, source->uv_crop_height,
- in_bd, shift);
+ c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
ssimv = a * .8 + .1 * (b + c);
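The similarity() arithmetic reformatted above is the standard per-block SSIM ratio built from five running sums; a self-contained, 8-bit-only sketch using the c1/c2 constants shown in the diff (the function name is illustrative):

#include <stdint.h>
#include <stdio.h>

/* Per-block SSIM ratio from running sums, as in similarity() for bd == 8. */
static double similarity_sketch(uint32_t sum_s, uint32_t sum_r,
                                uint32_t sum_sq_s, uint32_t sum_sq_r,
                                uint32_t sum_sxr, int count) {
  const int64_t c1 = 26634;  /* 64^2 * (.01*255)^2 */
  const int64_t c2 = 239708; /* 64^2 * (.03*255)^2 */
  const int64_t ssim_n =
      (2 * (int64_t)sum_s * sum_r + c1) *
      (2 * (int64_t)count * sum_sxr - 2 * (int64_t)sum_s * sum_r + c2);
  const int64_t ssim_d =
      ((int64_t)sum_s * sum_s + (int64_t)sum_r * sum_r + c1) *
      ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
       (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
  return (double)ssim_n / (double)ssim_d;
}

int main(void) {
  /* Identical 8x8 blocks (all pixels = 100) give an SSIM of 1.0. */
  printf("%f\n", similarity_sketch(6400, 6400, 640000, 640000, 640000, 64));
  return 0;
}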
diff --git a/vpx_dsp/ssim.h b/vpx_dsp/ssim.h
index d4d6b0d8a..4f2bb1d55 100644
--- a/vpx_dsp/ssim.h
+++ b/vpx_dsp/ssim.h
@@ -63,22 +63,20 @@ typedef struct {
} Metrics;
double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
- int img2_pitch, int width, int height, Ssimv *sv2,
- Metrics *m, int do_inconsistency);
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *weight);
+ const YV12_BUFFER_CONFIG *dest, double *weight);
double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *ssim_y, double *ssim_u,
- double *ssim_v, uint32_t bd, uint32_t in_bd);
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd);
#if CONFIG_VP9_HIGHBITDEPTH
double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest,
- double *weight,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
uint32_t bd, uint32_t in_bd);
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c
index 556e0134f..95e7071b2 100644
--- a/vpx_dsp/subtract.c
+++ b/vpx_dsp/subtract.c
@@ -16,32 +16,30 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-void vpx_subtract_block_c(int rows, int cols,
- int16_t *diff, ptrdiff_t diff_stride,
- const uint8_t *src, ptrdiff_t src_stride,
- const uint8_t *pred, ptrdiff_t pred_stride) {
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
int r, c;
for (r = 0; r < rows; r++) {
- for (c = 0; c < cols; c++)
- diff[c] = src[c] - pred[c];
+ for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
diff += diff_stride;
pred += pred_stride;
- src += src_stride;
+ src += src_stride;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_subtract_block_c(int rows, int cols,
- int16_t *diff, ptrdiff_t diff_stride,
- const uint8_t *src8, ptrdiff_t src_stride,
- const uint8_t *pred8, ptrdiff_t pred_stride,
- int bd) {
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride, int bd) {
int r, c;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- (void) bd;
+ (void)bd;
for (r = 0; r < rows; r++) {
for (c = 0; c < cols; c++) {
@@ -50,7 +48,7 @@ void vpx_highbd_subtract_block_c(int rows, int cols,
diff += diff_stride;
pred += pred_stride;
- src += src_stride;
+ src += src_stride;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
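vpx_subtract_block_c() above computes the prediction residual plane; a minimal sketch of the same element-wise subtraction with independent strides (names and sample data are invented):

#include <stdint.h>
#include <stdio.h>

/* diff = source - prediction, row by row, one stride per buffer. */
static void subtract_block_sketch(int rows, int cols, int16_t *diff,
                                  int diff_stride, const uint8_t *src,
                                  int src_stride, const uint8_t *pred,
                                  int pred_stride) {
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}

int main(void) {
  const uint8_t src[4] = { 10, 20, 30, 40 };
  const uint8_t pred[4] = { 8, 25, 30, 35 };
  int16_t diff[4];
  int i;
  subtract_block_sketch(2, 2, diff, 2, src, 2, pred, 2);
  for (i = 0; i < 4; i++) printf("%d ", diff[i]); /* 2 -5 0 5 */
  printf("\n");
  return 0;
}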
diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h
index 442e6a57b..fd27f928e 100644
--- a/vpx_dsp/txfm_common.h
+++ b/vpx_dsp/txfm_common.h
@@ -15,7 +15,7 @@
// Constants and Macros used by all idct/dct functions
#define DCT_CONST_BITS 14
-#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
#define UNIT_QUANT_SHIFT 2
#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
@@ -25,15 +25,15 @@
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const tran_high_t cospi_1_64 = 16364;
-static const tran_high_t cospi_2_64 = 16305;
-static const tran_high_t cospi_3_64 = 16207;
-static const tran_high_t cospi_4_64 = 16069;
-static const tran_high_t cospi_5_64 = 15893;
-static const tran_high_t cospi_6_64 = 15679;
-static const tran_high_t cospi_7_64 = 15426;
-static const tran_high_t cospi_8_64 = 15137;
-static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
static const tran_high_t cospi_10_64 = 14449;
static const tran_high_t cospi_11_64 = 14053;
static const tran_high_t cospi_12_64 = 13623;
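The cospi table shown above can be regenerated exactly as the in-file comment describes, i.e. round(16384 * cos(i*M_PI/64)); a small standalone program that prints the same declarations (the tran_high_t type name is only emitted as text here):

#include <math.h>
#include <stdio.h>

#if !defined(M_PI)
#define M_PI (3.141592653589793238462643)
#endif

/* Prints cospi_1_64 .. cospi_31_64 using the generator from the comment. */
int main(void) {
  int i;
  for (i = 1; i < 32; i++)
    printf("static const tran_high_t cospi_%d_64 = %.0f;\n", i,
           round(16384 * cos(i * M_PI / 64)));
  return 0;
}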
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index d960c5435..de22a63a5 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -17,18 +17,12 @@
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 },
- { 112, 16 },
- { 96, 32 },
- { 80, 48 },
- { 64, 64 },
- { 48, 80 },
- { 32, 96 },
- { 16, 112 },
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
-uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride) {
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride) {
int distortion = 0;
int r, c;
@@ -58,28 +52,23 @@ uint32_t vpx_get_mb_ss_c(const int16_t *a) {
uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
- b, b_stride, sse);
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
}
-
uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
- b, b_stride, sse);
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
- b, b_stride, sse);
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
}
-static void variance(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, uint32_t *sse, int *sum) {
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse, int *sum) {
int i, j;
*sum = 0;
@@ -115,9 +104,8 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
- b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
- (int)a[pixel_step] * filter[1],
- FILTER_BITS);
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
++a;
}
@@ -142,13 +130,12 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
- unsigned int i, j;
+ unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
- b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
- (int)a[pixel_step] * filter[1],
- FILTER_BITS);
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
++a;
}
@@ -157,82 +144,78 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
}
}
-#define VAR(W, H) \
-uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
+#define VAR(W, H) \
+ uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+ }
-#define SUBPIX_VAR(W, H) \
-uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
-\
- var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
-}
+#define SUBPIX_VAR(W, H) \
+ uint32_t vpx_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+ }
-#define SUBPIX_AVG_VAR(W, H) \
-uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- int xoffset, int yoffset, \
- const uint8_t *b, \
- int b_stride, \
- uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
-\
- var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
-\
- return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
-}
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ \
+ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ }
/* Identical to the variance call except it takes an additional parameter, sum,
* and returns that value using pass-by-reference instead of returning
* sse - sum^2 / w*h
*/
-#define GET_VAR(W, H) \
-void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse, int *sum) { \
- variance(a, a_stride, b, b_stride, W, H, sse, sum); \
-}
+#define GET_VAR(W, H) \
+ void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ int *sum) { \
+ variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+ }
/* Identical to the variance call except it does not calculate the
* sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
* variable.
*/
-#define MSE(W, H) \
-uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse; \
-}
+#define MSE(W, H) \
+ uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
- VAR(W, H) \
- SUBPIX_VAR(W, H) \
- SUBPIX_AVG_VAR(W, H)
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
VARIANCES(64, 64)
VARIANCES(64, 32)
@@ -256,9 +239,8 @@ MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
-void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height,
- const uint8_t *ref, int ref_stride) {
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; ++i) {
@@ -273,9 +255,9 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint64_t *sse, int64_t *sum) {
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -294,9 +276,9 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
}
}
-static void highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint32_t *sse, int *sum) {
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
int64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
@@ -304,9 +286,9 @@ static void highbd_8_variance(const uint8_t *a8, int a_stride,
*sum = (int)sum_long;
}
-static void highbd_10_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint32_t *sse, int *sum) {
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
int64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
@@ -314,9 +296,9 @@ static void highbd_10_variance(const uint8_t *a8, int a_stride,
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
-static void highbd_12_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint32_t *sse, int *sum) {
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
int64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
@@ -324,107 +306,91 @@ static void highbd_12_variance(const uint8_t *a8, int a_stride,
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
-#define HIGHBD_VAR(W, H) \
-uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- uint32_t *sse) { \
- int sum; \
- int64_t var; \
- highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
-} \
-\
-uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- uint32_t *sse) { \
- int sum; \
- int64_t var; \
- highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
-}
+#define HIGHBD_VAR(W, H) \
+ uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
-#define HIGHBD_GET_VAR(S) \
-void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse, int *sum) { \
- highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse, int *sum) { \
- highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse, int *sum) { \
- highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-}
+#define HIGHBD_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ }
-#define HIGHBD_MSE(W, H) \
-uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
-} \
-\
-uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
-} \
-\
-uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
-}
+#define HIGHBD_MSE(W, H) \
+ uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
static void highbd_var_filter_block2d_bil_first_pass(
- const uint8_t *src_ptr8,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
- output_ptr[j] =
- ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
- (int)src_ptr[pixel_step] * filter[1],
- FILTER_BITS);
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
++src_ptr;
}
@@ -436,21 +402,17 @@ static void highbd_var_filter_block2d_bil_first_pass(
}
static void highbd_var_filter_block2d_bil_second_pass(
- const uint16_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
const uint8_t *filter) {
- unsigned int i, j;
+ unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
- output_ptr[j] =
- ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
- (int)src_ptr[pixel_step] * filter[1],
- FILTER_BITS);
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
++src_ptr;
}
@@ -459,130 +421,118 @@ static void highbd_var_filter_block2d_bil_second_pass(
}
}
-#define HIGHBD_SUBPIX_VAR(W, H) \
-uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
- dst_stride, sse); \
-} \
-\
-uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, dst, dst_stride, sse); \
-} \
-\
-uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, dst, dst_stride, sse); \
-}
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ }
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
-uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
-} \
-\
-uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
- W, dst, dst_stride, sse); \
-} \
-\
-uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
- W, dst, dst_stride, sse); \
-}
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ }
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
- HIGHBD_VAR(W, H) \
- HIGHBD_SUBPIX_VAR(W, H) \
- HIGHBD_SUBPIX_AVG_VAR(W, H)
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
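
For reference, the arithmetic behind the HIGHBD_VAR kernels above: one pass accumulates both the sum and the sum of squares of the per-pixel differences, and the variance follows from the identity var = sse - sum^2 / (W * H). A minimal, self-contained scalar sketch of that identity (the accumulation helper below is written for illustration only and stands in for the library's highbd_*_variance helpers):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in: accumulate the sum and the sum of squared
 * differences between two W x H blocks of 16-bit samples. */
static void block_diff_stats(const uint16_t *a, int a_stride,
                             const uint16_t *b, int b_stride, int w, int h,
                             uint64_t *sse, int64_t *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += (int64_t)diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
}

int main(void) {
  uint16_t a[4 * 4], b[4 * 4];
  uint64_t sse;
  int64_t sum, var;
  int i;
  for (i = 0; i < 16; ++i) {
    a[i] = (uint16_t)(512 + i);        /* arbitrary 10-bit-range samples */
    b[i] = (uint16_t)(512 + (i & 3));
  }
  block_diff_stats(a, 4, b, 4, 4, 4, &sse, &sum);
  /* Same identity as the macro: variance = SSE - (sum * sum) / N. */
  var = (int64_t)sse - (sum * sum) / (4 * 4);
  printf("sse=%llu sum=%lld var=%lld\n", (unsigned long long)sse,
         (long long)sum, (long long)var);   /* sse=896 sum=96 var=320 */
  return 0;
}

The SUBPIX variants run the same variance kernels after the two bilinear filter passes above, and the 10-/12-bit variants additionally clamp a negative difference to zero before returning it as uint32_t.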
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index c18d9b48f..af767be65 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -22,15 +22,15 @@ extern "C" {
#define FILTER_BITS 7
#define FILTER_WEIGHT 128
-typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b_ptr, int b_stride);
+typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b_ptr, int b_stride);
-typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *second_pred);
+typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *second_pred);
-typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
- uint8_t *b, int b_stride, int n);
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
+ int b_stride, int n);
typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
@@ -38,8 +38,7 @@ typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *const b_array[],
- int b_stride,
- unsigned int *sad_array);
+ int b_stride, unsigned int *sad_array);
typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
@@ -50,40 +49,37 @@ typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse);
-typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
- int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b_ptr,
- int b_stride,
- unsigned int *sse,
- const uint8_t *second_pred);
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(
+ const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset,
+ const uint8_t *b_ptr, int b_stride, unsigned int *sse,
+ const uint8_t *second_pred);
#if CONFIG_VP8
typedef struct variance_vtable {
- vpx_sad_fn_t sdf;
- vpx_variance_fn_t vf;
+ vpx_sad_fn_t sdf;
+ vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
- vpx_variance_fn_t svf_halfpix_h;
- vpx_variance_fn_t svf_halfpix_v;
- vpx_variance_fn_t svf_halfpix_hv;
- vpx_sad_multi_fn_t sdx3f;
- vpx_sad_multi_fn_t sdx8f;
- vpx_sad_multi_d_fn_t sdx4df;
+ vpx_variance_fn_t svf_halfpix_h;
+ vpx_variance_fn_t svf_halfpix_v;
+ vpx_variance_fn_t svf_halfpix_hv;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
- vp8_copy32xn_fn_t copymem;
+ vp8_copy32xn_fn_t copymem;
#endif
} vp8_variance_fn_ptr_t;
#endif // CONFIG_VP8
#if CONFIG_VP9
typedef struct vp9_variance_vtable {
- vpx_sad_fn_t sdf;
- vpx_sad_avg_fn_t sdaf;
- vpx_variance_fn_t vf;
- vpx_subpixvariance_fn_t svf;
+ vpx_sad_fn_t sdf;
+ vpx_sad_avg_fn_t sdaf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
- vpx_sad_multi_fn_t sdx3f;
- vpx_sad_multi_fn_t sdx8f;
- vpx_sad_multi_d_fn_t sdx4df;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
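
The typedefs above define plain function-pointer types, and the vtable structs simply bundle kernels with matching signatures so encoders can dispatch per block size at runtime. A small illustration of wiring up one such slot (the 4x4 SAD routine below is a local dummy with the vpx_sad_fn_t signature, not the library implementation):

#include <stdint.h>
#include <stdio.h>

typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
                                     const uint8_t *b_ptr, int b_stride);

/* Local stand-in: 4x4 sum of absolute differences. */
static unsigned int sad4x4_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      const int diff = a[j] - b[j];
      sad += diff < 0 ? -diff : diff;
    }
    a += a_stride;
    b += b_stride;
  }
  return sad;
}

int main(void) {
  const uint8_t a[16] = { 10, 10, 10, 10, 20, 20, 20, 20,
                          30, 30, 30, 30, 40, 40, 40, 40 };
  const uint8_t b[16] = { 12, 12, 12, 12, 18, 18, 18, 18,
                          30, 30, 30, 30, 44, 44, 44, 44 };
  /* The slot a table like vp9_variance_fn_ptr_t holds for this block size. */
  vpx_sad_fn_t sdf = sad4x4_c;
  printf("sad = %u\n", sdf(a, 4, b, 4));   /* 32 */
  return 0;
}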
diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c
index 2d1c927cb..f17281b36 100644
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -21,8 +21,8 @@
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
int x, y;
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
@@ -31,8 +31,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
x_q4 += x_step_q4;
}
@@ -43,8 +42,8 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
int x, y;
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
@@ -53,10 +52,9 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
x_q4 += x_step_q4;
}
src += src_stride;
@@ -66,8 +64,8 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
int x, y;
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
@@ -89,8 +87,8 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
int x, y;
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
@@ -102,8 +100,10 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+ 1);
y_q4 += y_step_q4;
}
++src;
@@ -111,13 +111,11 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void convolve(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
+static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *const x_filters,
int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h) {
+ const InterpKernel *const y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -132,7 +130,7 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint8_t temp[135 * 64];
int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
@@ -158,67 +156,66 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
- convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h);
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ w, h);
}
void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
- convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h);
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h);
}
void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
- convolve_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h);
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
+ w, h);
}
void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
- convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h);
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h);
}
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
@@ -226,35 +223,34 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
- convolve(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
+ convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
filters_y, y0_q4, y_step_q4, w, h);
}
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
- vpx_convolve8_c(src, src_stride, temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+ y_step_q4, w, h);
vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int filter_x_stride, const int16_t *filter_y,
+ int filter_y_stride, int w, int h) {
int r;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
for (r = h; r > 0; --r) {
memcpy(dst, src, w);
@@ -263,47 +259,44 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int filter_x_stride, const int16_t *filter_y,
+ int filter_y_stride, int w, int h) {
int x, y;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x)
- dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
src += src_stride;
dst += dst_stride;
}
}
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
@@ -312,8 +305,8 @@ void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h);
}
@@ -321,17 +314,16 @@ void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h);
}
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
@@ -339,9 +331,8 @@ void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
#if CONFIG_VP9_HIGHBITDEPTH
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4,
- int w, int h, int bd) {
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -352,8 +343,7 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
x_q4 += x_step_q4;
}
@@ -364,9 +354,8 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4,
- int w, int h, int bd) {
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -377,10 +366,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
x_q4 += x_step_q4;
}
src += src_stride;
@@ -390,9 +379,8 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -405,8 +393,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
y_q4 += y_step_q4;
}
++src;
@@ -416,9 +404,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -431,8 +418,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
y_q4 += y_step_q4;
}
++src;
@@ -442,11 +431,9 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h, int bd) {
+ const InterpKernel *const x_filters, int x0_q4,
+ int x_step_q4, const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h, int bd) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -461,35 +448,33 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint16_t temp[64 * 135];
int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
- highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- x_filters, x0_q4, x_step_q4, w,
- intermediate_height, bd);
+ highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4,
+ x_step_q4, w, intermediate_height, bd);
highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
- w, h, bd);
+ 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h,
+ bd);
}
-
void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
- highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h, bd);
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -502,22 +487,22 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
(void)filter_y;
(void)y_step_q4;
- highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h, bd);
+ highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
- highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h, bd);
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -530,31 +515,30 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
(void)filter_x;
(void)x_step_q4;
- highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h, bd);
+ highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
- highbd_convolve(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
+ highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
filters_y, y0_q4, y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
assert(w <= 64);
@@ -562,8 +546,8 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
- NULL, 0, NULL, 0, w, h, bd);
+ vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL,
+ 0, NULL, 0, w, h, bd);
}
void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
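
The subpel indexing in convolve_horiz/convolve_vert above reduces to: take the integer part of the q4 position as the source offset, use the fractional part to select a kernel, apply it over SUBPEL_TAPS neighbours, then round by FILTER_BITS and clip. A single-output sketch of that inner loop, with the constants and helpers redefined locally to mirror (but not reuse) the vpx_dsp headers:

#include <stdint.h>
#include <stdio.h>

#define SUBPEL_BITS 4   /* q4 positions, as in the library */
#define SUBPEL_TAPS 8
#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
}

int main(void) {
  /* One made-up 8-tap kernel whose taps sum to 128 (1.0 in Q7).  In the
   * library, x_q4 & SUBPEL_MASK would pick one of 16 such kernels. */
  const int16_t kernel[SUBPEL_TAPS] = { -1, 3, -10, 70, 80, -14, 4, -4 };
  const uint8_t src[16] = { 40, 42, 44, 46, 48, 50, 52, 54,
                            56, 58, 60, 62, 64, 66, 68, 70 };
  const int x_q4 = 35;  /* sample position 2 + 3/16 */
  /* convolve_horiz first rewinds src by SUBPEL_TAPS / 2 - 1 so the kernel is
   * centred; here the offset is applied directly for brevity. */
  const uint8_t *src_x = &src[x_q4 >> SUBPEL_BITS];
  int k, sum = 0;
  for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * kernel[k];
  printf("filtered = %d\n",
         clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)));  /* prints 51 */
  return 0;
}

The avg variants differ only in the final store, which rounds the average of the filtered value and the existing dst pixel with ROUND_POWER_OF_TWO(dst[x] + ..., 1), as in convolve_avg_horiz above.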
diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h
index 9ed3f1750..ee9744b3a 100644
--- a/vpx_dsp/vpx_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -20,8 +20,8 @@ extern "C" {
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
#if CONFIG_VP9_HIGHBITDEPTH
typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 8d4f51418..49d36e545 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -59,12 +59,9 @@ static INLINE double fclamp(double value, double low, double high) {
static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
switch (bd) {
case 8:
- default:
- return (uint16_t)clamp(val, 0, 255);
- case 10:
- return (uint16_t)clamp(val, 0, 1023);
- case 12:
- return (uint16_t)clamp(val, 0, 4095);
+ default: return (uint16_t)clamp(val, 0, 255);
+ case 10: return (uint16_t)clamp(val, 0, 1023);
+ case 12: return (uint16_t)clamp(val, 0, 4095);
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
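
The switch above clamps to the largest representable sample for the given bit depth, i.e. (1 << bd) - 1. A quick standalone check of the three ranges (local helper with the same clamping behaviour, not the inline function itself):

#include <stdint.h>
#include <stdio.h>

/* Same shape as clip_pixel_highbd: clamp to 2^bd - 1 for bd = 8, 10, 12. */
static uint16_t clip_highbd(int val, int bd) {
  const int max = (1 << bd) - 1;  /* 255, 1023, 4095 */
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}

int main(void) {
  printf("%d %d %d\n", clip_highbd(5000, 8), clip_highbd(5000, 10),
         clip_highbd(5000, 12));  /* 255 1023 4095 */
  return 0;
}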
diff --git a/vpx_dsp/vpx_dsp_rtcd.c b/vpx_dsp/vpx_dsp_rtcd.c
index 5fe27b614..030c456d3 100644
--- a/vpx_dsp/vpx_dsp_rtcd.c
+++ b/vpx_dsp/vpx_dsp_rtcd.c
@@ -12,6 +12,4 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_dsp_rtcd() {
- once(setup_rtcd_internal);
-}
+void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
index 2617febf3..6cea251bc 100644
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -13,7 +13,6 @@
#include "vpx/vpx_integer.h"
-
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index f9af6cf97..b0a104bad 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -16,7 +16,7 @@
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
int *min, int *max) {
__m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
- u0 = _mm_setzero_si128();
+ u0 = _mm_setzero_si128();
// Row 0
s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
@@ -94,7 +94,7 @@ void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
unsigned int avg = 0;
- u0 = _mm_setzero_si128();
+ u0 = _mm_setzero_si128();
s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
s0 = _mm_adds_epu16(s0, s1);
@@ -121,7 +121,7 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
unsigned int avg = 0;
- u0 = _mm_setzero_si128();
+ u0 = _mm_setzero_si128();
s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
s0 = _mm_adds_epu16(s0, s1);
@@ -248,8 +248,8 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
int16_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
- int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
- + (idx & 0x01) * 8;
+ int16_t const *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
}
@@ -309,7 +309,7 @@ int vpx_satd_sse2(const int16_t *coeff, int length) {
return _mm_cvtsi128_si32(accum);
}
-void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
const int ref_stride, const int height) {
int idx;
__m128i zero = _mm_setzero_si128();
@@ -378,8 +378,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
return _mm_extract_epi16(s0, 0);
}
-int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
- const int bwl) {
+int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) {
int idx;
int width = 4 << bwl;
int16_t mean;
@@ -398,23 +397,23 @@ int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
diff = _mm_subs_epi16(v0, v1);
sum = _mm_add_epi16(sum, diff);
- v0 = _mm_madd_epi16(diff, diff);
+ v0 = _mm_madd_epi16(diff, diff);
sse = _mm_add_epi32(sse, v0);
ref += 8;
src += 8;
}
- v0 = _mm_srli_si128(sum, 8);
+ v0 = _mm_srli_si128(sum, 8);
sum = _mm_add_epi16(sum, v0);
- v0 = _mm_srli_epi64(sum, 32);
+ v0 = _mm_srli_epi64(sum, 32);
sum = _mm_add_epi16(sum, v0);
- v0 = _mm_srli_epi32(sum, 16);
+ v0 = _mm_srli_epi32(sum, 16);
sum = _mm_add_epi16(sum, v0);
- v1 = _mm_srli_si128(sse, 8);
+ v1 = _mm_srli_si128(sse, 8);
sse = _mm_add_epi32(sse, v1);
- v1 = _mm_srli_epi64(sse, 32);
+ v1 = _mm_srli_epi64(sse, 32);
sse = _mm_add_epi32(sse, v1);
mean = _mm_extract_epi16(sum, 0);
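
The tail of vpx_vector_var_sse2 above folds the eight 16-bit lanes of sum (and the four 32-bit lanes of sse) down into lane 0 with shift-and-add steps. A standalone check of that reduction pattern for the 16-bit case, using the same SSE2 intrinsics on a known vector:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  /* Lanes 0..7 hold 1..8; the horizontal sum is 36. */
  __m128i sum = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
  __m128i v0;

  v0 = _mm_srli_si128(sum, 8);   /* add the high four lanes onto the low four */
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);  /* fold lanes 2..3 onto lanes 0..1 */
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);  /* fold lane 1 onto lane 0 */
  sum = _mm_add_epi16(sum, v0);

  printf("horizontal sum = %d\n", _mm_extract_epi16(sum, 0));  /* 36 */
  return 0;
}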
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 7e43eb7c7..2a0516cdc 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -16,259 +16,179 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-typedef void filter8_1dfunction (
- const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter
-);
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(filter[3] != 128); \
- assert(step_q4 == 16); \
- if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- if (w == 8) { \
- vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } else if (w == 4) { \
- vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } \
- } else { \
- while (w >= 16) { \
- vpx_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- if (w == 8) { \
- vpx_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } else if (w == 4) { \
- vpx_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } \
- } \
-}
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vpx_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ assert(filter[3] != 128); \
+ assert(step_q4 == 16); \
+ if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ } \
+ } \
+ }
-#define FUN_CONV_2D(avg, opt) \
-void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(filter_x[3] != 128); \
- assert(filter_y[3] != 128); \
- assert(w <= 64); \
- assert(h <= 64); \
- assert(x_step_q4 == 16); \
- assert(y_step_q4 == 16); \
- if (filter_x[0] | filter_x[1] | filter_x[2]) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 7); \
- vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 1); \
- vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } \
-}
+#define FUN_CONV_2D(avg, opt) \
+ void vpx_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ assert(filter_x[3] != 128); \
+ assert(filter_y[3] != 128); \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ if (filter_x[0] | filter_x[1] | filter_x[2]) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, \
+ h + 7); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h + 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
#if CONFIG_VP9_HIGHBITDEPTH
-typedef void highbd_filter8_1dfunction (
- const uint16_t *src_ptr,
- const ptrdiff_t src_pitch,
- uint16_t *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const int16_t *filter,
- int bd
-);
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
- ptrdiff_t src_stride, \
- uint8_t *dst8, \
- ptrdiff_t dst_stride, \
- const int16_t *filter_x, \
- int x_step_q4, \
- const int16_t *filter_y, \
- int y_step_q4, \
- int w, int h, int bd) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
- } \
-}
+ void vpx_highbd_convolve8_##name##_##opt( \
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h, bd); \
+ } \
+ }
-#define HIGH_FUN_CONV_2D(avg, opt) \
-void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 7, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
- 64, dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 1, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
- dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } \
- } else { \
- vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h, bd); \
- } \
-}
+#define HIGH_FUN_CONV_2D(avg, opt) \
+ void vpx_highbd_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ } else { \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+ }
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // VPX_DSP_X86_CONVOLVE_H_
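
The filter[0] | filter[1] | filter[2] test in FUN_CONV_1D and HIGH_FUN_CONV_1D above separates a genuine 8-tap kernel from a bilinear kernel stored in 8-tap form, whose outer taps are all zero, so the cheaper 2-tap specialisations can be dispatched. A tiny illustration of that predicate (the kernels below are made-up values in the same Q7 format, not the library's filter tables):

#include <stdint.h>
#include <stdio.h>

/* Nonzero outer taps => full 8-tap kernel; all-zero outer taps => the kernel
 * is effectively 2-tap (bilinear) and the lighter path applies. */
static int needs_8tap(const int16_t *filter) {
  return filter[0] | filter[1] | filter[2];
}

int main(void) {
  const int16_t eight_tap[8] = { -1, 3, -10, 70, 80, -14, 4, -4 };
  const int16_t bilinear[8] = { 0, 0, 0, 112, 16, 0, 0, 0 };
  printf("eight_tap -> %s\n",
         needs_8tap(eight_tap) ? "8-tap path" : "2-tap path");
  printf("bilinear  -> %s\n",
         needs_8tap(bilinear) ? "8-tap path" : "2-tap path");
  return 0;
}

The surrounding while (w >= 16) / w == 8 / w == 4 structure then tiles the block into the widest column the remaining width allows before handing any leftover width back to the C fallback.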
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 951af3a62..39d3a3f59 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -13,15 +13,15 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
-#define pair256_set_epi16(a, b) \
+#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-#define pair256_set_epi32(a, b) \
- _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \
- (int)(b), (int)(a), (int)(b), (int)(a))
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a))
#if FDCT32x32_HIGH_PRECISION
static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
@@ -40,8 +40,7 @@ static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
}
#endif
-void FDCT32x32_2D_AVX2(const int16_t *input,
- int16_t *output_org, int stride) {
+void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
const int str2 = 2 * stride;
@@ -53,43 +52,45 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
- const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
- const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_p16_m16 =
+ pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
- const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
- const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
- const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
- const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
- const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
- const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
- const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
- const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
- const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
- const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
- const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
- const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
- const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
- const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
- const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
- const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
- const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
- const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
- const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
- const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
- const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
- const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
- const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
- const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
- const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
- const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
- const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
- const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i k__cospi_m12_m20 =
+ pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
const __m256i kZero = _mm256_set1_epi16(0);
- const __m256i kOne = _mm256_set1_epi16(1);
+ const __m256i kOne = _mm256_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
for (pass = 0; pass < 2; ++pass) {
@@ -104,125 +105,149 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// Note: even though all the loads below are aligned, using the aligned
// intrinsic make the code slightly slower.
if (0 == pass) {
- const int16_t *in = &input[column_start];
+ const int16_t *in = &input[column_start];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- const int16_t *ina = in + 0 * str1;
- const int16_t *inb = in + 31 * str1;
- __m256i *step1a = &step1[ 0];
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m256i *step1a = &step1[0];
__m256i *step1b = &step1[31];
- const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
- const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
- const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
- const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
- const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
- const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
- const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
- const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
- step1a[ 0] = _mm256_add_epi16(ina0, inb0);
- step1a[ 1] = _mm256_add_epi16(ina1, inb1);
- step1a[ 2] = _mm256_add_epi16(ina2, inb2);
- step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
step1b[-3] = _mm256_sub_epi16(ina3, inb3);
step1b[-2] = _mm256_sub_epi16(ina2, inb2);
step1b[-1] = _mm256_sub_epi16(ina1, inb1);
step1b[-0] = _mm256_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
}
{
- const int16_t *ina = in + 4 * str1;
- const int16_t *inb = in + 27 * str1;
- __m256i *step1a = &step1[ 4];
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m256i *step1a = &step1[4];
__m256i *step1b = &step1[27];
- const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
- const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
- const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
- const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
- const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
- const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
- const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
- const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
- step1a[ 0] = _mm256_add_epi16(ina0, inb0);
- step1a[ 1] = _mm256_add_epi16(ina1, inb1);
- step1a[ 2] = _mm256_add_epi16(ina2, inb2);
- step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
step1b[-3] = _mm256_sub_epi16(ina3, inb3);
step1b[-2] = _mm256_sub_epi16(ina2, inb2);
step1b[-1] = _mm256_sub_epi16(ina1, inb1);
step1b[-0] = _mm256_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
}
{
- const int16_t *ina = in + 8 * str1;
- const int16_t *inb = in + 23 * str1;
- __m256i *step1a = &step1[ 8];
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m256i *step1a = &step1[8];
__m256i *step1b = &step1[23];
- const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
- const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
- const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
- const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
- const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
- const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
- const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
- const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
- step1a[ 0] = _mm256_add_epi16(ina0, inb0);
- step1a[ 1] = _mm256_add_epi16(ina1, inb1);
- step1a[ 2] = _mm256_add_epi16(ina2, inb2);
- step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
step1b[-3] = _mm256_sub_epi16(ina3, inb3);
step1b[-2] = _mm256_sub_epi16(ina2, inb2);
step1b[-1] = _mm256_sub_epi16(ina1, inb1);
step1b[-0] = _mm256_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
}
{
- const int16_t *ina = in + 12 * str1;
- const int16_t *inb = in + 19 * str1;
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
__m256i *step1a = &step1[12];
__m256i *step1b = &step1[19];
- const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
- const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
- const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
- const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
- const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
- const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
- const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
- const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
- step1a[ 0] = _mm256_add_epi16(ina0, inb0);
- step1a[ 1] = _mm256_add_epi16(ina1, inb1);
- step1a[ 2] = _mm256_add_epi16(ina2, inb2);
- step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
step1b[-3] = _mm256_sub_epi16(ina3, inb3);
step1b[-2] = _mm256_sub_epi16(ina2, inb2);
step1b[-1] = _mm256_sub_epi16(ina1, inb1);
step1b[-0] = _mm256_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
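The four pass-0 blocks above all vectorize the same stage-1 butterfly spelled out in the earlier comment: mirrored rows are summed and differenced, then pre-scaled by << 2. A scalar restatement, under the same indexing:

/* Scalar restatement of the pass-0 stage-1 butterfly vectorized above:
 * sums fill the low half of step1, differences fill the mirrored slot,
 * and both are pre-scaled by << 2 as noted in the comment above. */
#include <stdint.h>

void fdct32_stage1_scalar(const int16_t *in, int stride, int16_t *step1 /*[32]*/) {
  for (int i = 0; i < 16; ++i) {
    const int a = in[i * stride];
    const int b = in[(31 - i) * stride];
    step1[i] = (int16_t)((a + b) * 4);
    step1[31 - i] = (int16_t)((a - b) * 4);
  }
}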
@@ -237,52 +262,52 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
- __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
- __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
- __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
- __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
- __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
- __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
- __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
- step1[ 0] = _mm256_add_epi16(in00, in31);
- step1[ 1] = _mm256_add_epi16(in01, in30);
- step1[ 2] = _mm256_add_epi16(in02, in29);
- step1[ 3] = _mm256_add_epi16(in03, in28);
+ __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
+ __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
+ __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
+ __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
+ __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+ __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+ __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+ __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+ step1[0] = _mm256_add_epi16(in00, in31);
+ step1[1] = _mm256_add_epi16(in01, in30);
+ step1[2] = _mm256_add_epi16(in02, in29);
+ step1[3] = _mm256_add_epi16(in03, in28);
step1[28] = _mm256_sub_epi16(in03, in28);
step1[29] = _mm256_sub_epi16(in02, in29);
step1[30] = _mm256_sub_epi16(in01, in30);
step1[31] = _mm256_sub_epi16(in00, in31);
}
{
- __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
- __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
- __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
- __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
- __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
- __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
- __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
- __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
- step1[ 4] = _mm256_add_epi16(in04, in27);
- step1[ 5] = _mm256_add_epi16(in05, in26);
- step1[ 6] = _mm256_add_epi16(in06, in25);
- step1[ 7] = _mm256_add_epi16(in07, in24);
+ __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
+ __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
+ __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
+ __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
+ __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+ __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+ __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+ __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+ step1[4] = _mm256_add_epi16(in04, in27);
+ step1[5] = _mm256_add_epi16(in05, in26);
+ step1[6] = _mm256_add_epi16(in06, in25);
+ step1[7] = _mm256_add_epi16(in07, in24);
step1[24] = _mm256_sub_epi16(in07, in24);
step1[25] = _mm256_sub_epi16(in06, in25);
step1[26] = _mm256_sub_epi16(in05, in26);
step1[27] = _mm256_sub_epi16(in04, in27);
}
{
- __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
- __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
- __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
- __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
- __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
- __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
- __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
- __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
- step1[ 8] = _mm256_add_epi16(in08, in23);
- step1[ 9] = _mm256_add_epi16(in09, in22);
+ __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
+ __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
+ __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+ __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+ __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+ __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+ __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+ __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+ step1[8] = _mm256_add_epi16(in08, in23);
+ step1[9] = _mm256_add_epi16(in09, in22);
step1[10] = _mm256_add_epi16(in10, in21);
step1[11] = _mm256_add_epi16(in11, in20);
step1[20] = _mm256_sub_epi16(in11, in20);
@@ -291,14 +316,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
step1[23] = _mm256_sub_epi16(in08, in23);
}
{
- __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
- __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
- __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
- __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
- __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
- __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
- __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
- __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+ __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+ __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+ __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+ __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+ __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+ __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+ __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+ __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
step1[12] = _mm256_add_epi16(in12, in19);
step1[13] = _mm256_add_epi16(in13, in18);
step1[14] = _mm256_add_epi16(in14, in17);
@@ -311,16 +336,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
}
// Stage 2
{
- step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
- step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
- step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
- step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
- step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
- step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
- step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
- step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
- step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
- step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+ step2[0] = _mm256_add_epi16(step1[0], step1[15]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[14]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[13]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[12]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[11]);
+ step2[5] = _mm256_add_epi16(step1[5], step1[10]);
+ step2[6] = _mm256_add_epi16(step1[6], step1[9]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[8]);
+ step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
+ step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
@@ -354,22 +379,38 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
// dct_const_round_shift
- const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_4 =
+ _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_5 =
+ _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_4 =
+ _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_5 =
+ _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_4 =
+ _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_5 =
+ _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_4 =
+ _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_5 =
+ _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_4 =
+ _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_5 =
+ _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_4 =
+ _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_5 =
+ _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_4 =
+ _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_5 =
+ _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_4 =
+ _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_5 =
+ _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
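The add-then-srai pairs above (and the many like them later in the file) are the vector form of libvpx's dct_const_round_shift: a 32-bit fixed-point product is rounded by adding half of 2^DCT_CONST_BITS and then arithmetically shifted right. A one-line scalar equivalent, using DCT_CONST_BITS = 14 as in txfm_common.h:

/* Scalar dct_const_round_shift: round a fixed-point product back down by
 * DCT_CONST_BITS, adding half the divisor first. */
#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

int32_t dct_const_round_shift_scalar(int32_t x) {
  return (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
}

In the vector code the two halves of each product register are rounded separately (the _4/_5 pairs), shifted (the _6/_7 pairs), and then re-packed to 16 bits with _mm256_packs_epi32 in the stages that follow.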
@@ -401,49 +442,49 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// dump the magnitude by half, hence the intermediate values are within
// the range of 16 bits.
if (1 == pass) {
- __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]);
- __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]);
- __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]);
- __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]);
- __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]);
- __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]);
- __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]);
- __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]);
- __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]);
- __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]);
- __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]);
- __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]);
- __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]);
- __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]);
- __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]);
- __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]);
- __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]);
- __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]);
- __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]);
- __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]);
- __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]);
- __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]);
- __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]);
- __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]);
- __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]);
- __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]);
- __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]);
- __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]);
- __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]);
- __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]);
- __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]);
- __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]);
-
- step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
- step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
- step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
- step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
- step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
- step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
- step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
- step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
- step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
- step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+ __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
+ __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
+ __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
+ __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
+ __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
+ __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
+ __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
+ __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
+ __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
+ __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
+ __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
+ __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
+ __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
+ __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
+ __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
+ __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
+ __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
+ __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
+ __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
+ __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
+ __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
+ __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
+ __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
+ __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
+ __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
+ __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
+ __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
+ __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
+ __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
+ __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
+ __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
+ __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
+
+ step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
+ step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
+ step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
+ step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
+ step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
+ step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
+ step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
+ step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
+ step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
+ step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
@@ -467,16 +508,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
- step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
- step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
- step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
- step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
- step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
- step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
- step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
- step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
- step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
- step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+ step2[0] = _mm256_add_epi16(step2[0], kOne);
+ step2[1] = _mm256_add_epi16(step2[1], kOne);
+ step2[2] = _mm256_add_epi16(step2[2], kOne);
+ step2[3] = _mm256_add_epi16(step2[3], kOne);
+ step2[4] = _mm256_add_epi16(step2[4], kOne);
+ step2[5] = _mm256_add_epi16(step2[5], kOne);
+ step2[6] = _mm256_add_epi16(step2[6], kOne);
+ step2[7] = _mm256_add_epi16(step2[7], kOne);
+ step2[8] = _mm256_add_epi16(step2[8], kOne);
+ step2[9] = _mm256_add_epi16(step2[9], kOne);
step2[10] = _mm256_add_epi16(step2[10], kOne);
step2[11] = _mm256_add_epi16(step2[11], kOne);
step2[12] = _mm256_add_epi16(step2[12], kOne);
@@ -500,16 +541,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
step1[30] = _mm256_add_epi16(step1[30], kOne);
step1[31] = _mm256_add_epi16(step1[31], kOne);
- step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
- step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
- step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
- step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
- step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
- step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
- step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
- step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
- step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
- step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+ step2[0] = _mm256_srai_epi16(step2[0], 2);
+ step2[1] = _mm256_srai_epi16(step2[1], 2);
+ step2[2] = _mm256_srai_epi16(step2[2], 2);
+ step2[3] = _mm256_srai_epi16(step2[3], 2);
+ step2[4] = _mm256_srai_epi16(step2[4], 2);
+ step2[5] = _mm256_srai_epi16(step2[5], 2);
+ step2[6] = _mm256_srai_epi16(step2[6], 2);
+ step2[7] = _mm256_srai_epi16(step2[7], 2);
+ step2[8] = _mm256_srai_epi16(step2[8], 2);
+ step2[9] = _mm256_srai_epi16(step2[9], 2);
step2[10] = _mm256_srai_epi16(step2[10], 2);
step2[11] = _mm256_srai_epi16(step2[11], 2);
step2[12] = _mm256_srai_epi16(step2[12], 2);
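Taken together, the three hunks above implement the pass-1 renormalization described by the earlier comment: the cmpgt/sub pair adds 1 to negative values, kOne adds the rounding bias, and the arithmetic shift divides by 4 with a symmetric bias so the intermediates stay within 16 bits. A scalar equivalent of the per-element operation:

/* Scalar form of the pass-1 renormalization spread across the hunks above:
 * add 1 where the value is negative (the cmpgt/sub pair), add the kOne
 * rounding bias, then arithmetic shift right by 2. */
#include <stdint.h>

int16_t round_shift_by_2(int16_t x) {
  return (int16_t)((x + (x < 0) + 1) >> 2);
}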
@@ -538,616 +579,796 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
#if FDCT32x32_HIGH_PRECISION
if (pass == 0) {
#endif
- // Stage 3
- {
- step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
- step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
- step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
- step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
- step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
- step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
- step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
- step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
- }
- {
- const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
- const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
- const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
- const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
- const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
- const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
- const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
- const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
- const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
- const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
- const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
- const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
- // Combine
- step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
- step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
- step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
- step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
- }
- {
- step3[16] = _mm256_add_epi16(step2[23], step1[16]);
- step3[17] = _mm256_add_epi16(step2[22], step1[17]);
- step3[18] = _mm256_add_epi16(step2[21], step1[18]);
- step3[19] = _mm256_add_epi16(step2[20], step1[19]);
- step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
- step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
- step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
- step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
- step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
- step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
- step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
- step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
- step3[28] = _mm256_add_epi16(step2[27], step1[28]);
- step3[29] = _mm256_add_epi16(step2[26], step1[29]);
- step3[30] = _mm256_add_epi16(step2[25], step1[30]);
- step3[31] = _mm256_add_epi16(step2[24], step1[31]);
- }
+ // Stage 3
+ {
+ step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+ }
- // Stage 4
- {
- step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
- step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
- step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
- step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
- step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
- step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
- step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
- step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
- step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
- step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
- step1[14] = _mm256_add_epi16(step3[13], step2[14]);
- step1[15] = _mm256_add_epi16(step3[12], step2[15]);
- }
- {
- const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
- const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
- const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
- const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
- const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
- const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
- // Combine
- step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
- step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
- }
- {
- const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
- const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
- const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
- const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
- const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
- const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
- const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
- const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
- const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
- const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
- const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
- const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
- const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
- const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
- const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
- const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
- const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
- const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
- const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
- const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
- const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
- const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
- const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
- const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
- // Combine
- step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
- step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
- step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
- step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
- step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
- step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
- step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
- step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
- }
- // Stage 5
- {
- step2[4] = _mm256_add_epi16(step1[5], step3[4]);
- step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
- step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
- step2[7] = _mm256_add_epi16(step1[6], step3[7]);
- }
- {
- const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
- const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
- const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
- const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
- const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
- const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
- const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
- const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
- const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
- const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
- const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
- const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
- const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
- const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
- const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
- const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
- const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
- const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
- const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
- const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
- const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
- const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
- const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
- const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
- const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
- const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
- const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
- // Combine
- out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
- out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
- out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
- out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
- }
- {
- const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
- const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
- const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
- const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
- const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
- const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
- const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
- const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
- const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
- const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
- const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
- const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
- const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
- const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
- const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
- const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
- const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
- const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
- const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
- const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
- const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
- // Combine
- step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
- step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
- step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
- step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
- }
- {
- step2[16] = _mm256_add_epi16(step1[19], step3[16]);
- step2[17] = _mm256_add_epi16(step1[18], step3[17]);
- step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
- step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
- step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
- step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
- step2[22] = _mm256_add_epi16(step1[21], step3[22]);
- step2[23] = _mm256_add_epi16(step1[20], step3[23]);
- step2[24] = _mm256_add_epi16(step1[27], step3[24]);
- step2[25] = _mm256_add_epi16(step1[26], step3[25]);
- step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
- step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
- step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
- step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
- step2[30] = _mm256_add_epi16(step1[29], step3[30]);
- step2[31] = _mm256_add_epi16(step1[28], step3[31]);
- }
- // Stage 6
- {
- const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
- const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
- const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
- const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
- const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
- const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
- const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
- const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
- const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
- const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
- const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
- const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
- const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
- const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
- const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
- const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
- // dct_const_round_shift
- const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
- const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
- const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
- const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
- const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
- const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
- const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
- const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
- const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
- const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
- const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
- const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
- const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
- const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
- const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
- const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
- // Combine
- out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
- out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
- out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
- out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
- }
- {
- step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
- step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
- step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
- step3[11] = _mm256_add_epi16(step2[10], step1[11]);
- step3[12] = _mm256_add_epi16(step2[13], step1[12]);
- step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
- step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
- step3[15] = _mm256_add_epi16(step2[14], step1[15]);
- }
- {
- const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
- const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
- const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
- const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
- const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
- const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
- const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
- const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
- const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
- const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
- const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
- const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
- const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
- const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
- const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
- const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
- const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
- const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
- const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
- const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
- const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
- const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
- const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
- const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
- // dct_const_round_shift
- const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
- const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
- const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
- const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
- const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
- const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
- const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
- const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
- const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
- const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
- const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
- const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
- const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
- const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
- const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
- const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
- // Combine
- step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
- step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
- step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
- step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
- // Combine
- step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
- step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
- step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
- step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
- }
- // Stage 7
- {
- const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
- const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
- const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
- const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
- const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
- const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
- const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
- const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
- const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
- const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
- const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
- const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
- const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
- const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
- const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
- const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
- const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
- const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
- const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
- const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
- const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
- const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
- const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
- const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
- // dct_const_round_shift
- const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
- const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
- const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
- const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
- const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
- const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
- const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
- const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
- const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
- const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
- const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
- const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
- const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
- const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
- const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
- const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
- const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
- const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
- const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
- const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
- const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
- const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
- const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
- const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
- const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
- const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
- const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
- const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
- const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
- const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
- const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
- const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
- // Combine
- out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
- out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
- out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
- out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
- out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
- out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
- out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
- out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
- }
- {
- step1[16] = _mm256_add_epi16(step3[17], step2[16]);
- step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
- step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
- step1[19] = _mm256_add_epi16(step3[18], step2[19]);
- step1[20] = _mm256_add_epi16(step3[21], step2[20]);
- step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
- step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
- step1[23] = _mm256_add_epi16(step3[22], step2[23]);
- step1[24] = _mm256_add_epi16(step3[25], step2[24]);
- step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
- step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
- step1[27] = _mm256_add_epi16(step3[26], step2[27]);
- step1[28] = _mm256_add_epi16(step3[29], step2[28]);
- step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
- step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
- step1[31] = _mm256_add_epi16(step3[30], step2[31]);
- }
- // Final stage --- outputs indices are bit-reversed.
- {
- const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
- const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
- const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
- const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
- const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
- const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
- const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
- const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
- const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
- const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
- const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
- const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
- const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
- const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
- const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
- const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
- const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
- const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
- const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
- const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
- const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
- const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
- const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
- const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
- // dct_const_round_shift
- const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
- const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
- const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
- const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
- const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
- const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
- const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
- const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
- const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
- const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
- const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
- const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
- const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
- const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
- const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
- const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
- const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
- const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
- const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
- const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
- const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
- const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
- const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
- const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
- const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
- const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
- const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
- const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
- const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
- const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
- const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
- const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
- // Combine
- out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
- out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
- out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
- out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
- out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
- out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
- out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
- out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
- }
- {
- const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
- const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
- const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
- const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
- const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
- const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
- const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
- const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
- const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
- const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
- const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
- const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
- const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
- const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
- const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
- const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
- const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
- const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
- const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
- const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
- const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
- const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
- const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
- const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
- // dct_const_round_shift
- const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
- const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
- const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
- const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
- const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
- const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
- const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
- const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
- const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
- const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
- const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
- const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
- const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
- const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
- const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
- const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
- const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
- const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
- const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
- const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
- const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
- const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
- const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
- const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
- const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
- const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
- const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
- const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
- const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
- const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
- const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
- const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
- // Combine
- out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
- out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
- out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
- out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
- out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
- out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
- out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
- out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
- }
+ // Stage 4
+ {
+ step1[0] = _mm256_add_epi16(step3[3], step3[0]);
+ step1[1] = _mm256_add_epi16(step3[2], step3[1]);
+ step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
+ step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
+ step1[8] = _mm256_add_epi16(step3[11], step2[8]);
+ step1[9] = _mm256_add_epi16(step3[10], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+ const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+ const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s1_05_4 =
+ _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_5 =
+ _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_4 =
+ _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_5 =
+ _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+ }
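The unpack/madd pairs in the block above (and in every similar block that follows) implement a fixed-point butterfly: interleaving two int16 vectors and multiply-adding against a constant whose 16-bit lanes alternate (c0, c1) yields a*c0 + b*c1 in each 32-bit lane. A minimal sketch of that idiom, not part of the patch; the helper names are illustrative and the pair layout is assumed to match the k__cospi_* constants:

#include <immintrin.h>
#include <stdint.h>

/* Pack (c0, c1) into every 32-bit lane so each lane reads as two signed
 * 16-bit values, the layout assumed for the k__cospi_* constants above. */
static __m256i pair_const(int16_t c0, int16_t c1) {
  return _mm256_set1_epi32((int)(((uint32_t)(uint16_t)c1 << 16) | (uint16_t)c0));
}

/* a*c0 + b*c1 for the interleaved low halves of each 128-bit lane; the
 * _mm256_unpackhi_epi16 variant covers the upper halves, as in the code. */
static __m256i butterfly_lo(__m256i a, __m256i b, int16_t c0, int16_t c1) {
  const __m256i ab = _mm256_unpacklo_epi16(a, b); /* a0 b0 a1 b1 ... */
  return _mm256_madd_epi16(ab, pair_const(c0, c1));
}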
+ {
+ const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+ const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+ const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+ const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+ const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+ const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+ const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+ const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+ const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s1_18_4 =
+ _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_5 =
+ _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_4 =
+ _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_5 =
+ _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_4 =
+ _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_5 =
+ _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_4 =
+ _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_5 =
+ _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_4 =
+ _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_5 =
+ _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_4 =
+ _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_5 =
+ _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_4 =
+ _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_5 =
+ _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_4 =
+ _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_5 =
+ _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+ const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+ const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+ const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+ const __m256i out_00_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m256i out_00_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m256i out_16_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m256i out_16_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m256i out_08_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m256i out_08_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m256i out_24_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m256i out_24_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m256i out_00_4 =
+ _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_5 =
+ _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_4 =
+ _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_5 =
+ _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_4 =
+ _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_5 =
+ _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_4 =
+ _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_5 =
+ _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
+ const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
+ const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+ const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+ const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s2_09_4 =
+ _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_5 =
+ _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_4 =
+ _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_5 =
+ _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_4 =
+ _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_5 =
+ _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_4 =
+ _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_5 =
+ _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_04_2 =
+ _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m256i out_04_3 =
+ _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m256i out_20_2 =
+ _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m256i out_20_3 =
+ _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m256i out_12_2 =
+ _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m256i out_12_3 =
+ _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m256i out_28_2 =
+ _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m256i out_28_3 =
+ _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m256i out_04_4 =
+ _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_5 =
+ _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_4 =
+ _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_5 =
+ _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_4 =
+ _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_5 =
+ _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_4 =
+ _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_5 =
+ _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[8] = _mm256_add_epi16(step2[9], step1[8]);
+ step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
+ step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+ const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+ const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+ const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+ const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+ const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+ const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+ const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+ const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m256i s3_17_4 =
+ _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_5 =
+ _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_4 =
+ _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_5 =
+ _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_4 =
+ _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_5 =
+ _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_4 =
+ _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_5 =
+ _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m256i s3_25_4 =
+ _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_5 =
+ _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_4 =
+ _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_5 =
+ _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_4 =
+ _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_5 =
+ _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_4 =
+ _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_5 =
+ _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
+ const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
+ const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
+ const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
+ const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+ const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+ const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+ const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+ const __m256i out_02_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m256i out_02_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m256i out_18_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m256i out_18_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m256i out_10_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m256i out_10_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m256i out_26_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m256i out_26_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m256i out_06_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m256i out_06_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m256i out_22_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m256i out_22_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m256i out_14_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m256i out_14_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m256i out_30_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m256i out_30_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m256i out_02_4 =
+ _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_5 =
+ _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_4 =
+ _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_5 =
+ _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_4 =
+ _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_5 =
+ _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_4 =
+ _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_5 =
+ _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_4 =
+ _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_5 =
+ _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_4 =
+ _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_5 =
+ _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_4 =
+ _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_5 =
+ _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_4 =
+ _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_5 =
+ _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+ }
+ // Final stage --- outputs indices are bit-reversed.
+ {
+ const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+ const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+ const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+ const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+ const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+ const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+ const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+ const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+ const __m256i out_01_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m256i out_01_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m256i out_17_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m256i out_17_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m256i out_09_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m256i out_09_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m256i out_25_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m256i out_25_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m256i out_07_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m256i out_07_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m256i out_23_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m256i out_23_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m256i out_15_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m256i out_15_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m256i out_31_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m256i out_31_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m256i out_01_4 =
+ _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_5 =
+ _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_4 =
+ _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_5 =
+ _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_4 =
+ _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_5 =
+ _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_4 =
+ _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_5 =
+ _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_4 =
+ _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_5 =
+ _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_4 =
+ _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_5 =
+ _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_4 =
+ _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_5 =
+ _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_4 =
+ _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_5 =
+ _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+ const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+ const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+ const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+ const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+ const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+ const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+ const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+ const __m256i out_05_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m256i out_05_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m256i out_21_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m256i out_21_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m256i out_13_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m256i out_13_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m256i out_29_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m256i out_29_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m256i out_03_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m256i out_03_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m256i out_19_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m256i out_19_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m256i out_11_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m256i out_11_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m256i out_27_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m256i out_27_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m256i out_05_4 =
+ _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_5 =
+ _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_4 =
+ _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_5 =
+ _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_4 =
+ _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_5 =
+ _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_4 =
+ _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_5 =
+ _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_4 =
+ _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_5 =
+ _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_4 =
+ _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_5 =
+ _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_4 =
+ _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_5 =
+ _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_4 =
+ _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_5 =
+ _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+ }
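Each of the blocks above ends with the same "dct_const_round_shift" sequence: add k__DCT_CONST_ROUNDING, arithmetic-shift right by DCT_CONST_BITS, then saturate-pack back to int16. A scalar model of that rounding step, assuming DCT_CONST_BITS is 14 and k__DCT_CONST_ROUNDING holds 1 << (DCT_CONST_BITS - 1) in every 32-bit lane:

#include <stdint.h>

#define DCT_CONST_BITS_SKETCH 14 /* assumed value of DCT_CONST_BITS */

/* Round-to-nearest fixed-point descale: add half of 2^14, then shift. */
static int32_t round_shift_sketch(int32_t x) {
  return (x + (1 << (DCT_CONST_BITS_SKETCH - 1))) >> DCT_CONST_BITS_SKETCH;
}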
#if FDCT32x32_HIGH_PRECISION
} else {
__m256i lstep1[64], lstep2[64], lstep3[64];
@@ -1157,32 +1378,32 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// stage 3
{
// expanding to 32-bit length priori to addition operations
- lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
- lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
- lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
- lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
- lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
- lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
- lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
- lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
- lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
- lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
- lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
- lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
- lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
- lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
- lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
- lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
- lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
- lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
- lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
- lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
- lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
- lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
- lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
- lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
- lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
- lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+ lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero);
+ lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero);
+ lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero);
+ lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero);
+ lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero);
+ lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero);
+ lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero);
+ lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero);
+ lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
+ lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
+ lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
+ lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
+ lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
+ lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
+ lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
+ lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
+ lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
+ lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
+ lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
+ lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
+ lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
+ lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
+ lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
+ lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
+ lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
+ lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
@@ -1190,22 +1411,22 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
- lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
- lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
- lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
- lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
- lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
- lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
- lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
- lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
- lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
- lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
- lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
- lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
- lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
- lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
- lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
- lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+ lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]);
+ lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]);
+ lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]);
+ lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]);
+ lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]);
+ lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]);
+ lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]);
+ lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]);
+ lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
+ lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]);
+ lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]);
+ lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]);
+ lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]);
+ lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]);
+ lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]);
+ lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]);
}
{
const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
@@ -1221,14 +1442,22 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
// dct_const_round_shift
- const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
@@ -1343,10 +1572,10 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// stage 4
{
// expanding to 32-bit length priori to addition operations
- lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
- lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
- lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
- lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero);
+ lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero);
+ lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero);
+ lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero);
lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
@@ -1360,14 +1589,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
- lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
- lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
- lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
- lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
- lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
- lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
- lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
- lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+ lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
@@ -1386,57 +1615,62 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
}
{
- // to be continued...
- //
- const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
- const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
- // instruction latency.
- v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
- v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
- v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
- v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
- v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
- v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
- v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
- v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-
- u[0] = k_packs_epi64_avx2(v[0], v[1]);
- u[1] = k_packs_epi64_avx2(v[2], v[3]);
- u[2] = k_packs_epi64_avx2(v[4], v[5]);
- u[3] = k_packs_epi64_avx2(v[6], v[7]);
-
- v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ // to be continued...
+ //
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
}
{
- const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
- const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
-
- u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
- u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
- u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
- u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
- u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
- u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
- u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
- u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
- u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
- u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
@@ -1444,16 +1678,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
- v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
- v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
- v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
- v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
- v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
- v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
- v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
- v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
- v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
- v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
@@ -1464,29 +1698,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
- v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
- v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
- v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
- v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
- v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
- v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
- v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
- v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
- v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
- v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
-
- u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
- u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
- u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
- u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
- u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
u[10] = k_packs_epi64_avx2(v[20], v[21]);
u[11] = k_packs_epi64_avx2(v[22], v[23]);
u[12] = k_packs_epi64_avx2(v[24], v[25]);
@@ -1494,16 +1728,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = k_packs_epi64_avx2(v[28], v[29]);
u[15] = k_packs_epi64_avx2(v[30], v[31]);
- v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -1511,16 +1745,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
- lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
- lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
- lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
- lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
- lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
- lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
- lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
- lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
- lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
@@ -1530,20 +1764,24 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
}
// stage 5
{
- lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
- lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
- lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
- lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+ lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
}
{
- const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
- const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
- const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
- const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
@@ -1556,16 +1794,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
// instruction latency.
- v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
- v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
- v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
- v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
- v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
- v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
- v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
- v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
- v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
- v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
@@ -1600,14 +1838,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
- sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
- sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
- sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
- sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
- sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
- sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
- sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
- sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
u[0] = _mm256_sub_epi32(u[0], sign[0]);
u[1] = _mm256_sub_epi32(u[1], sign[1]);
@@ -1637,15 +1875,18 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[7] = _mm256_srai_epi32(u[7], 2);
// Combine
- out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+ out[0] = _mm256_packs_epi32(u[0], u[1]);
out[16] = _mm256_packs_epi32(u[2], u[3]);
- out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+ out[8] = _mm256_packs_epi32(u[4], u[5]);
out[24] = _mm256_packs_epi32(u[6], u[7]);
}
{
- const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
- const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
@@ -1664,8 +1905,8 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
- v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
- v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
@@ -1736,15 +1977,19 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
}
// stage 6
{
- const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
- const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
- const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
- const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
-
- u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
- u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
- u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
- u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
@@ -1753,10 +1998,10 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
- u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
- u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
- u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
@@ -1766,8 +2011,8 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
- v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
- v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
@@ -1802,14 +2047,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
- sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
- sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
- sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
- sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
- sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
- sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
- sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
- sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
u[0] = _mm256_sub_epi32(u[0], sign[0]);
u[1] = _mm256_sub_epi32(u[1], sign[1]);
@@ -1838,7 +2083,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[6] = _mm256_srai_epi32(u[6], 2);
u[7] = _mm256_srai_epi32(u[7], 2);
- out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+ out[4] = _mm256_packs_epi32(u[0], u[1]);
out[20] = _mm256_packs_epi32(u[2], u[3]);
out[12] = _mm256_packs_epi32(u[4], u[5]);
out[28] = _mm256_packs_epi32(u[6], u[7]);
@@ -1862,24 +2107,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
}
{
- const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
- const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
- const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
- const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
- -cospi_20_64);
- const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
- const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
-
- u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
- u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
- u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
- u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
- u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
- u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
- u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
- u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
- u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
- u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m256i k32_m28_m04 =
+ pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m12_m20 =
+ pair256_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
@@ -1887,16 +2137,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
- v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
- v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
- v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
- v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
- v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
- v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
- v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
- v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
- v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
- v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+ v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
@@ -1907,29 +2157,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
- v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
- v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+ v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
- v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
- v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
- v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
- v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
- v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
- v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
- v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
- v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
-
- u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
- u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
- u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
- u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
- u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
u[10] = k_packs_epi64_avx2(v[20], v[21]);
u[11] = k_packs_epi64_avx2(v[22], v[23]);
u[12] = k_packs_epi64_avx2(v[24], v[25]);
@@ -1937,16 +2187,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = k_packs_epi64_avx2(v[28], v[29]);
u[15] = k_packs_epi64_avx2(v[30], v[31]);
- v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -1954,16 +2204,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
- lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
- lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
- lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
- lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
- lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
- lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
- lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
- lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
- lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
@@ -1973,25 +2223,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
}
// stage 7
{
- const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
- const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
- const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
- const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64, cospi_26_64);
- const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
- const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
- const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
- const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
-
- u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
- u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
- u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
- u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
- u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
- u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
- u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
- u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
- u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
- u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+ const __m256i k32_p30_p02 =
+ pair256_set_epi32(cospi_30_64, cospi_2_64);
+ const __m256i k32_p14_p18 =
+ pair256_set_epi32(cospi_14_64, cospi_18_64);
+ const __m256i k32_p22_p10 =
+ pair256_set_epi32(cospi_22_64, cospi_10_64);
+ const __m256i k32_p06_p26 =
+ pair256_set_epi32(cospi_6_64, cospi_26_64);
+ const __m256i k32_m26_p06 =
+ pair256_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m256i k32_m10_p22 =
+ pair256_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m256i k32_m18_p14 =
+ pair256_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m256i k32_m02_p30 =
+ pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
@@ -1999,16 +2257,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
- v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
- v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
- v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
- v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
- v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
- v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
- v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
- v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
- v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
- v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+ v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
@@ -2019,29 +2277,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
- v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
- v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
- v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
- v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
- v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
- v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
- v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
- v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
- v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
- v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
-
- u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
- u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
- u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
- u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
- u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
u[10] = k_packs_epi64_avx2(v[20], v[21]);
u[11] = k_packs_epi64_avx2(v[22], v[23]);
u[12] = k_packs_epi64_avx2(v[24], v[25]);
@@ -2049,16 +2307,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = k_packs_epi64_avx2(v[28], v[29]);
u[15] = k_packs_epi64_avx2(v[30], v[31]);
- v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2066,16 +2324,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
- u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
- u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
- u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
- u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
- u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
- u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
- u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
- u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
- u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2083,33 +2341,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
- v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
- v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
- v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
- v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
- v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
- v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
- v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
- v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
- v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
- v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
- v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
- v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
- v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
- v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
- v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
- v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
-
- u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
- u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
- u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
- u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
- u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
- u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
- u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
- u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
- u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
- u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
u[10] = _mm256_sub_epi32(u[10], v[10]);
u[11] = _mm256_sub_epi32(u[11], v[11]);
u[12] = _mm256_sub_epi32(u[12], v[12]);
@@ -2117,16 +2375,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_sub_epi32(u[14], v[14]);
u[15] = _mm256_sub_epi32(u[15], v[15]);
- v[ 0] = _mm256_add_epi32(u[ 0], K32One);
- v[ 1] = _mm256_add_epi32(u[ 1], K32One);
- v[ 2] = _mm256_add_epi32(u[ 2], K32One);
- v[ 3] = _mm256_add_epi32(u[ 3], K32One);
- v[ 4] = _mm256_add_epi32(u[ 4], K32One);
- v[ 5] = _mm256_add_epi32(u[ 5], K32One);
- v[ 6] = _mm256_add_epi32(u[ 6], K32One);
- v[ 7] = _mm256_add_epi32(u[ 7], K32One);
- v[ 8] = _mm256_add_epi32(u[ 8], K32One);
- v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
v[10] = _mm256_add_epi32(u[10], K32One);
v[11] = _mm256_add_epi32(u[11], K32One);
v[12] = _mm256_add_epi32(u[12], K32One);
@@ -2134,16 +2392,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[14] = _mm256_add_epi32(u[14], K32One);
v[15] = _mm256_add_epi32(u[15], K32One);
- u[ 0] = _mm256_srai_epi32(v[ 0], 2);
- u[ 1] = _mm256_srai_epi32(v[ 1], 2);
- u[ 2] = _mm256_srai_epi32(v[ 2], 2);
- u[ 3] = _mm256_srai_epi32(v[ 3], 2);
- u[ 4] = _mm256_srai_epi32(v[ 4], 2);
- u[ 5] = _mm256_srai_epi32(v[ 5], 2);
- u[ 6] = _mm256_srai_epi32(v[ 6], 2);
- u[ 7] = _mm256_srai_epi32(v[ 7], 2);
- u[ 8] = _mm256_srai_epi32(v[ 8], 2);
- u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
u[10] = _mm256_srai_epi32(v[10], 2);
u[11] = _mm256_srai_epi32(v[11], 2);
u[12] = _mm256_srai_epi32(v[12], 2);
@@ -2151,11 +2409,11 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_srai_epi32(v[14], 2);
u[15] = _mm256_srai_epi32(v[15], 2);
- out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+ out[2] = _mm256_packs_epi32(u[0], u[1]);
out[18] = _mm256_packs_epi32(u[2], u[3]);
out[10] = _mm256_packs_epi32(u[4], u[5]);
out[26] = _mm256_packs_epi32(u[6], u[7]);
- out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+ out[6] = _mm256_packs_epi32(u[8], u[9]);
out[22] = _mm256_packs_epi32(u[10], u[11]);
out[14] = _mm256_packs_epi32(u[12], u[13]);
out[30] = _mm256_packs_epi32(u[14], u[15]);
@@ -2196,25 +2454,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
}
// stage 8
{
- const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
- const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
- const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
- const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
- const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
- const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
- const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
- const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
-
- u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
- u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
- u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
- u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
- u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
- u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
- u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
- u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
- u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
- u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+ const __m256i k32_p31_p01 =
+ pair256_set_epi32(cospi_31_64, cospi_1_64);
+ const __m256i k32_p15_p17 =
+ pair256_set_epi32(cospi_15_64, cospi_17_64);
+ const __m256i k32_p23_p09 =
+ pair256_set_epi32(cospi_23_64, cospi_9_64);
+ const __m256i k32_p07_p25 =
+ pair256_set_epi32(cospi_7_64, cospi_25_64);
+ const __m256i k32_m25_p07 =
+ pair256_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m256i k32_m09_p23 =
+ pair256_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m256i k32_m17_p15 =
+ pair256_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m256i k32_m01_p31 =
+ pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
@@ -2222,16 +2488,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
- v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
- v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
- v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
- v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
- v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
- v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
- v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
- v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
- v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
- v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+ v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
@@ -2242,29 +2508,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
- v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
- v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
- v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
- v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
- v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
- v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
- v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
- v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
- v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
- v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
-
- u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
- u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
- u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
- u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
- u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
u[10] = k_packs_epi64_avx2(v[20], v[21]);
u[11] = k_packs_epi64_avx2(v[22], v[23]);
u[12] = k_packs_epi64_avx2(v[24], v[25]);
@@ -2272,16 +2538,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = k_packs_epi64_avx2(v[28], v[29]);
u[15] = k_packs_epi64_avx2(v[30], v[31]);
- v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2289,16 +2555,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
- u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
- u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
- u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
- u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
- u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
- u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
- u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
- u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
- u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2306,33 +2572,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
- v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
- v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
- v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
- v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
- v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
- v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
- v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
- v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
- v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
- v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
- v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
- v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
- v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
- v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
- v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
- v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
-
- u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
- u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
- u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
- u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
- u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
- u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
- u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
- u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
- u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
- u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
u[10] = _mm256_sub_epi32(u[10], v[10]);
u[11] = _mm256_sub_epi32(u[11], v[11]);
u[12] = _mm256_sub_epi32(u[12], v[12]);
@@ -2374,35 +2640,43 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_srai_epi32(v[14], 2);
u[15] = _mm256_srai_epi32(v[15], 2);
- out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+ out[1] = _mm256_packs_epi32(u[0], u[1]);
out[17] = _mm256_packs_epi32(u[2], u[3]);
- out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+ out[9] = _mm256_packs_epi32(u[4], u[5]);
out[25] = _mm256_packs_epi32(u[6], u[7]);
- out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+ out[7] = _mm256_packs_epi32(u[8], u[9]);
out[23] = _mm256_packs_epi32(u[10], u[11]);
out[15] = _mm256_packs_epi32(u[12], u[13]);
out[31] = _mm256_packs_epi32(u[14], u[15]);
}
{
- const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
- const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
- const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
- const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
- const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
- const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
- const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
- const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
-
- u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
- u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
- u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
- u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
- u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
- u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
- u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
- u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
- u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
- u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+ const __m256i k32_p27_p05 =
+ pair256_set_epi32(cospi_27_64, cospi_5_64);
+ const __m256i k32_p11_p21 =
+ pair256_set_epi32(cospi_11_64, cospi_21_64);
+ const __m256i k32_p19_p13 =
+ pair256_set_epi32(cospi_19_64, cospi_13_64);
+ const __m256i k32_p03_p29 =
+ pair256_set_epi32(cospi_3_64, cospi_29_64);
+ const __m256i k32_m29_p03 =
+ pair256_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m256i k32_m13_p19 =
+ pair256_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m256i k32_m21_p11 =
+ pair256_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m256i k32_m05_p27 =
+ pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
@@ -2410,16 +2684,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
- v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
- v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
- v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
- v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
- v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
- v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
- v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
- v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
- v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
- v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+ v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
@@ -2430,29 +2704,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
- v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
- v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
- v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
- v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
- v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
- v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
- v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
- v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
- v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
- v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
-
- u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
- u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
- u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
- u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
- u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
u[10] = k_packs_epi64_avx2(v[20], v[21]);
u[11] = k_packs_epi64_avx2(v[22], v[23]);
u[12] = k_packs_epi64_avx2(v[24], v[25]);
@@ -2460,16 +2734,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = k_packs_epi64_avx2(v[28], v[29]);
u[15] = k_packs_epi64_avx2(v[30], v[31]);
- v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2477,16 +2751,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
- u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
- u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
- u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
- u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
- u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
- u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
- u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
- u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
- u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2494,33 +2768,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
- v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
- v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
- v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
- v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
- v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
- v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
- v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
- v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
- v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
- v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
- v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
- v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
- v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
- v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
- v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
- v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
-
- u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
- u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
- u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
- u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
- u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
- u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
- u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
- u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
- u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
- u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
u[10] = _mm256_sub_epi32(u[10], v[10]);
u[11] = _mm256_sub_epi32(u[11], v[11]);
u[12] = _mm256_sub_epi32(u[12], v[12]);
@@ -2562,11 +2836,11 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
u[14] = _mm256_srai_epi32(v[14], 2);
u[15] = _mm256_srai_epi32(v[15], 2);
- out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+ out[5] = _mm256_packs_epi32(u[0], u[1]);
out[21] = _mm256_packs_epi32(u[2], u[3]);
out[13] = _mm256_packs_epi32(u[4], u[5]);
out[29] = _mm256_packs_epi32(u[6], u[7]);
- out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+ out[3] = _mm256_packs_epi32(u[8], u[9]);
out[19] = _mm256_packs_epi32(u[10], u[11]);
out[11] = _mm256_packs_epi32(u[12], u[13]);
out[27] = _mm256_packs_epi32(u[14], u[15]);
@@ -2576,13 +2850,13 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// Transpose the results, do it as four 8x8 transposes.
{
int transpose_block;
- int16_t *output_currStep,*output_nextStep;
- if (0 == pass){
- output_currStep = &intermediate[column_start * 32];
- output_nextStep = &intermediate[(column_start + 8) * 32];
- } else{
- output_currStep = &output_org[column_start * 32];
- output_nextStep = &output_org[(column_start + 8) * 32];
+ int16_t *output_currStep, *output_nextStep;
+ if (0 == pass) {
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ } else {
+ output_currStep = &output_org[column_start * 32];
+ output_nextStep = &output_org[(column_start + 8) * 32];
}
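/*
 * A sketch of the surrounding control flow (editor's illustration, not part
 * of the patch; the column step and the pass-0 addressing are assumptions
 * based on context): the 2-D transform runs the same 1-D column code twice.
 * Pass 0 reads `input` and parks its transposed results in `intermediate`
 * (row stride 32); pass 1 reads `intermediate` and writes the final
 * coefficients to `output_org`.  The if/else above only picks which buffer
 * this iteration's 8x8 transposes land in.
 */
#include <stdint.h>

static void fdct32x32_pass_buffers_sketch(const int16_t *input,
                                          int16_t *output_org, int stride) {
  int16_t intermediate[32 * 32];
  int pass, column_start;
  (void)input;
  (void)stride;
  for (pass = 0; pass < 2; ++pass) {
    for (column_start = 0; column_start < 32; column_start += 16) {
      /* destination for the 16 columns handled in this iteration */
      int16_t *curr = (pass == 0) ? &intermediate[column_start * 32]
                                  : &output_org[column_start * 32];
      int16_t *next = curr + 8 * 32; /* output_nextStep above */
      /* ... 1-D transform of 16 columns, then four 8x8 transposes stored
         through curr/next ... */
      (void)curr;
      (void)next;
    }
  }
}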
for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
__m256i *this_out = &out[8 * transpose_block];
@@ -2685,23 +2959,39 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
}
// Note: even though all these stores are aligned, using the aligned
        // intrinsic makes the code slightly slower.
- _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
- _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
- _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
- _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
- _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
- _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
- _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
- _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
-
- _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+ _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+ _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+ _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+ _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+ _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+ _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+ _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+ _mm256_castsi256_si128(tr2_7));
+
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+ _mm256_extractf128_si256(tr2_0, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+ _mm256_extractf128_si256(tr2_1, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+ _mm256_extractf128_si256(tr2_2, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+ _mm256_extractf128_si256(tr2_3, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+ _mm256_extractf128_si256(tr2_4, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+ _mm256_extractf128_si256(tr2_5, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+ _mm256_extractf128_si256(tr2_6, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+ _mm256_extractf128_si256(tr2_7, 1));
// Process next 8x8
output_currStep += 8;
output_nextStep += 8;
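/*
 * A sketch of the store pattern that ends the AVX2 transpose above (editor's
 * illustration, not part of the patch): each transposed __m256i row carries
 * two horizontally adjacent groups of eight 16-bit outputs, so its low
 * 128 bits go to the current 8-column destination and its high 128 bits to
 * the destination 8 columns further on, 32 int16 apart per row.
 */
#include <immintrin.h>
#include <stdint.h>

static void store_transposed_row(int16_t *dst_curr, int16_t *dst_next, int row,
                                 __m256i tr2_row) {
  _mm_storeu_si128((__m128i *)(dst_curr + row * 32),
                   _mm256_castsi256_si128(tr2_row)); /* low 128 bits */
  _mm_storeu_si128((__m128i *)(dst_next + row * 32),
                   _mm256_extractf128_si256(tr2_row, 1)); /* high 128 bits */
}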
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index b85ae103f..374433390 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -22,42 +22,37 @@
#define SUB_EPI16 _mm_subs_epi16
#if FDCT32x32_HIGH_PRECISION
void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j)
- temp_in[j] = intermediate[j * 32 + i];
- vpx_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ vpx_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
}
- #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
- #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
+#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
+#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
#else
void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j)
- temp_in[j] = intermediate[j * 32 + i];
- vpx_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] = (tran_low_t)temp_out[j];
- }
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ vpx_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
}
- #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
- #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
+#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
+#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
#endif // FDCT32x32_HIGH_PRECISION
#else
#define ADD_EPI16 _mm_add_epi16
#define SUB_EPI16 _mm_sub_epi16
#endif // DCT_HIGH_BIT_DEPTH
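/*
 * Editor's illustration (not part of the patch): when DCT_HIGH_BIT_DEPTH is
 * defined, SUB_EPI16 resolves to the saturating _mm_subs_epi16 (and
 * ADD_EPI16 presumably to _mm_adds_epi16), so a 16-bit overflow leaves a
 * lane pinned at INT16_MIN/INT16_MAX; the check_epi16_overflow_* calls below
 * detect the overflow, and the function then falls back to the C transform
 * (HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C).  The helper here is one
 * plausible way to detect a pinned lane -- an assumption, not the library's
 * implementation of those checks.
 */
#include <emmintrin.h>

static int any_lane_saturated(__m128i v) {
  const __m128i max16 = _mm_set1_epi16(0x7fff);
  const __m128i min16 = _mm_set1_epi16((short)0x8000);
  const __m128i hit =
      _mm_or_si128(_mm_cmpeq_epi16(v, max16), _mm_cmpeq_epi16(v, min16));
  return _mm_movemask_epi8(hit) != 0; /* nonzero => take the scalar fallback */
}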
-
-void FDCT32x32_2D(const int16_t *input,
- tran_low_t *output_org, int stride) {
+void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
const int str2 = 2 * stride;
@@ -70,42 +65,42 @@ void FDCT32x32_2D(const int16_t *input,
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
- const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
- const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
- const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
- const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
- const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
- const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
- const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
- const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
- const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
- const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
- const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
- const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
- const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
- const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i kZero = _mm_set1_epi16(0);
- const __m128i kOne = _mm_set1_epi16(1);
+ const __m128i kOne = _mm_set1_epi16(1);
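/*
 * Editor's illustration (not part of the patch) of why the constants above
 * are built as 16-bit pairs: after _mm_unpacklo/hi_epi16 interleaves two rows
 * as x0,y0,x1,y1,..., one _mm_madd_epi16 against the packed pair (c0,c1)
 * produces x_i*c0 + y_i*c1 in each 32-bit lane -- half of a DCT butterfly in
 * a single instruction.  pair_epi16_sketch stands in for pair_set_epi16 and
 * is not its actual definition.
 */
#include <emmintrin.h>
#include <stdint.h>

static __m128i pair_epi16_sketch(int16_t c0, int16_t c1) {
  /* lane layout c0,c1,c0,c1,... (arguments of _mm_set_epi16 run high to low) */
  return _mm_set_epi16(c1, c0, c1, c0, c1, c0, c1, c0);
}

static void butterfly_madd_sketch(__m128i x, __m128i y, int16_t c0, int16_t c1,
                                  __m128i *lo32, __m128i *hi32) {
  const __m128i k = pair_epi16_sketch(c0, c1);
  *lo32 = _mm_madd_epi16(_mm_unpacklo_epi16(x, y), k); /* x_i*c0 + y_i*c1 */
  *hi32 = _mm_madd_epi16(_mm_unpackhi_epi16(x, y), k);
}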
// Do the two transform/transpose passes
int pass;
#if DCT_HIGH_BIT_DEPTH
@@ -123,125 +118,125 @@ void FDCT32x32_2D(const int16_t *input,
// Note: even though all the loads below are aligned, using the aligned
      // intrinsic makes the code slightly slower.
if (0 == pass) {
- const int16_t *in = &input[column_start];
+ const int16_t *in = &input[column_start];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- const int16_t *ina = in + 0 * str1;
- const int16_t *inb = in + 31 * str1;
- __m128i *step1a = &step1[ 0];
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[0];
__m128i *step1b = &step1[31];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
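/*
 * Scalar form of the four load blocks in this pass (editor's illustration,
 * not part of the patch) -- the comment above notes they could be one loop,
 * at a speed cost; per column that loop is simply the mirrored
 * sum/difference with the << 2 input pre-scale.  int32_t is used only to
 * keep the sketch simple; the vector code stays in 16 bits.
 */
#include <stdint.h>

static void fdct32_stage1_sketch(const int16_t *in, int stride,
                                 int32_t step1[32]) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int32_t a = in[i * stride];
    const int32_t b = in[(31 - i) * stride];
    step1[i] = (a + b) * 4;      /* _mm_add_epi16 + _mm_slli_epi16(x, 2) */
    step1[31 - i] = (a - b) * 4; /* _mm_sub_epi16 + _mm_slli_epi16(x, 2) */
  }
}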
{
- const int16_t *ina = in + 4 * str1;
- const int16_t *inb = in + 27 * str1;
- __m128i *step1a = &step1[ 4];
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[4];
__m128i *step1b = &step1[27];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- const int16_t *ina = in + 8 * str1;
- const int16_t *inb = in + 23 * str1;
- __m128i *step1a = &step1[ 8];
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[8];
__m128i *step1b = &step1[23];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- const int16_t *ina = in + 12 * str1;
- const int16_t *inb = in + 19 * str1;
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
__m128i *step1a = &step1[12];
__m128i *step1b = &step1[19];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
@@ -256,14 +251,14 @@ void FDCT32x32_2D(const int16_t *input,
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
- __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
- __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
- __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
- __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
- __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
- __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
- __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
step1[0] = ADD_EPI16(in00, in31);
step1[1] = ADD_EPI16(in01, in30);
step1[2] = ADD_EPI16(in02, in29);
@@ -283,14 +278,14 @@ void FDCT32x32_2D(const int16_t *input,
#endif // DCT_HIGH_BIT_DEPTH
}
{
- __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
- __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
- __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
- __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
- __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
- __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
- __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
- __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
step1[4] = ADD_EPI16(in04, in27);
step1[5] = ADD_EPI16(in05, in26);
step1[6] = ADD_EPI16(in06, in25);
@@ -310,14 +305,14 @@ void FDCT32x32_2D(const int16_t *input,
#endif // DCT_HIGH_BIT_DEPTH
}
{
- __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
- __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
- __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
- __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
- __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
- __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
- __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
- __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
step1[8] = ADD_EPI16(in08, in23);
step1[9] = ADD_EPI16(in09, in22);
step1[10] = ADD_EPI16(in10, in21);
@@ -337,14 +332,14 @@ void FDCT32x32_2D(const int16_t *input,
#endif // DCT_HIGH_BIT_DEPTH
}
{
- __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
- __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
- __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
- __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
- __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
- __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
- __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
- __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
step1[12] = ADD_EPI16(in12, in19);
step1[13] = ADD_EPI16(in13, in18);
step1[14] = ADD_EPI16(in14, in17);
@@ -372,10 +367,10 @@ void FDCT32x32_2D(const int16_t *input,
step2[3] = ADD_EPI16(step1[3], step1[12]);
step2[4] = ADD_EPI16(step1[4], step1[11]);
step2[5] = ADD_EPI16(step1[5], step1[10]);
- step2[6] = ADD_EPI16(step1[6], step1[ 9]);
- step2[7] = ADD_EPI16(step1[7], step1[ 8]);
- step2[8] = SUB_EPI16(step1[7], step1[ 8]);
- step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+ step2[6] = ADD_EPI16(step1[6], step1[9]);
+ step2[7] = ADD_EPI16(step1[7], step1[8]);
+ step2[8] = SUB_EPI16(step1[7], step1[8]);
+ step2[9] = SUB_EPI16(step1[6], step1[9]);
step2[10] = SUB_EPI16(step1[5], step1[10]);
step2[11] = SUB_EPI16(step1[4], step1[11]);
step2[12] = SUB_EPI16(step1[3], step1[12]);
@@ -384,9 +379,8 @@ void FDCT32x32_2D(const int16_t *input,
step2[15] = SUB_EPI16(step1[0], step1[15]);
#if DCT_HIGH_BIT_DEPTH
overflow = check_epi16_overflow_x16(
- &step2[0], &step2[1], &step2[2], &step2[3],
- &step2[4], &step2[5], &step2[6], &step2[7],
- &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
&step2[12], &step2[13], &step2[14], &step2[15]);
if (overflow) {
if (pass == 0)
@@ -482,16 +476,16 @@ void FDCT32x32_2D(const int16_t *input,
      // drop the magnitude by half, hence the intermediate values are within
// the range of 16 bits.
if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
__m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
__m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
__m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
@@ -515,16 +509,16 @@ void FDCT32x32_2D(const int16_t *input,
__m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
__m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
- step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
- step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
- step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
- step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
- step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
- step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
- step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
- step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
- step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
- step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+ step2[0] = SUB_EPI16(step2[0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[9], s2_09_0);
step2[10] = SUB_EPI16(step2[10], s3_10_0);
step2[11] = SUB_EPI16(step2[11], s3_11_0);
step2[12] = SUB_EPI16(step2[12], s3_12_0);
@@ -549,29 +543,27 @@ void FDCT32x32_2D(const int16_t *input,
step1[31] = SUB_EPI16(step1[31], s3_31_0);
#if DCT_HIGH_BIT_DEPTH
overflow = check_epi16_overflow_x32(
- &step2[0], &step2[1], &step2[2], &step2[3],
- &step2[4], &step2[5], &step2[6], &step2[7],
- &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15],
- &step1[16], &step1[17], &step1[18], &step1[19],
- &step2[20], &step2[21], &step2[22], &step2[23],
- &step2[24], &step2[25], &step2[26], &step2[27],
- &step1[28], &step1[29], &step1[30], &step1[31]);
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
+ &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
+ &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
+ &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
}
#endif // DCT_HIGH_BIT_DEPTH
- step2[0] = _mm_add_epi16(step2[ 0], kOne);
- step2[1] = _mm_add_epi16(step2[ 1], kOne);
- step2[2] = _mm_add_epi16(step2[ 2], kOne);
- step2[3] = _mm_add_epi16(step2[ 3], kOne);
- step2[4] = _mm_add_epi16(step2[ 4], kOne);
- step2[5] = _mm_add_epi16(step2[ 5], kOne);
- step2[6] = _mm_add_epi16(step2[ 6], kOne);
- step2[7] = _mm_add_epi16(step2[ 7], kOne);
- step2[8] = _mm_add_epi16(step2[ 8], kOne);
- step2[9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[0] = _mm_add_epi16(step2[0], kOne);
+ step2[1] = _mm_add_epi16(step2[1], kOne);
+ step2[2] = _mm_add_epi16(step2[2], kOne);
+ step2[3] = _mm_add_epi16(step2[3], kOne);
+ step2[4] = _mm_add_epi16(step2[4], kOne);
+ step2[5] = _mm_add_epi16(step2[5], kOne);
+ step2[6] = _mm_add_epi16(step2[6], kOne);
+ step2[7] = _mm_add_epi16(step2[7], kOne);
+ step2[8] = _mm_add_epi16(step2[8], kOne);
+ step2[9] = _mm_add_epi16(step2[9], kOne);
step2[10] = _mm_add_epi16(step2[10], kOne);
step2[11] = _mm_add_epi16(step2[11], kOne);
step2[12] = _mm_add_epi16(step2[12], kOne);
@@ -595,16 +587,16 @@ void FDCT32x32_2D(const int16_t *input,
step1[30] = _mm_add_epi16(step1[30], kOne);
step1[31] = _mm_add_epi16(step1[31], kOne);
- step2[0] = _mm_srai_epi16(step2[ 0], 2);
- step2[1] = _mm_srai_epi16(step2[ 1], 2);
- step2[2] = _mm_srai_epi16(step2[ 2], 2);
- step2[3] = _mm_srai_epi16(step2[ 3], 2);
- step2[4] = _mm_srai_epi16(step2[ 4], 2);
- step2[5] = _mm_srai_epi16(step2[ 5], 2);
- step2[6] = _mm_srai_epi16(step2[ 6], 2);
- step2[7] = _mm_srai_epi16(step2[ 7], 2);
- step2[8] = _mm_srai_epi16(step2[ 8], 2);
- step2[9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[0] = _mm_srai_epi16(step2[0], 2);
+ step2[1] = _mm_srai_epi16(step2[1], 2);
+ step2[2] = _mm_srai_epi16(step2[2], 2);
+ step2[3] = _mm_srai_epi16(step2[3], 2);
+ step2[4] = _mm_srai_epi16(step2[4], 2);
+ step2[5] = _mm_srai_epi16(step2[5], 2);
+ step2[6] = _mm_srai_epi16(step2[6], 2);
+ step2[7] = _mm_srai_epi16(step2[7], 2);
+ step2[8] = _mm_srai_epi16(step2[8], 2);
+ step2[9] = _mm_srai_epi16(step2[9], 2);
step2[10] = _mm_srai_epi16(step2[10], 2);
step2[11] = _mm_srai_epi16(step2[11], 2);
step2[12] = _mm_srai_epi16(step2[12], 2);
@@ -633,821 +625,884 @@ void FDCT32x32_2D(const int16_t *input,
#if FDCT32x32_HIGH_PRECISION
if (pass == 0) {
#endif
- // Stage 3
- {
- step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
- step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
- step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
- step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
- step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
- step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
- step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
- step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+ // Stage 3
+ {
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
- &step3[3], &step3[4], &step3[5],
- &step3[6], &step3[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- // Combine
- step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
- step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
- step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
- step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
- &step3[12], &step3[13]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
}
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[16] = ADD_EPI16(step2[23], step1[16]);
- step3[17] = ADD_EPI16(step2[22], step1[17]);
- step3[18] = ADD_EPI16(step2[21], step1[18]);
- step3[19] = ADD_EPI16(step2[20], step1[19]);
- step3[20] = SUB_EPI16(step1[19], step2[20]);
- step3[21] = SUB_EPI16(step1[18], step2[21]);
- step3[22] = SUB_EPI16(step1[17], step2[22]);
- step3[23] = SUB_EPI16(step1[16], step2[23]);
- step3[24] = SUB_EPI16(step1[31], step2[24]);
- step3[25] = SUB_EPI16(step1[30], step2[25]);
- step3[26] = SUB_EPI16(step1[29], step2[26]);
- step3[27] = SUB_EPI16(step1[28], step2[27]);
- step3[28] = ADD_EPI16(step2[27], step1[28]);
- step3[29] = ADD_EPI16(step2[26], step1[29]);
- step3[30] = ADD_EPI16(step2[25], step1[30]);
- step3[31] = ADD_EPI16(step2[24], step1[31]);
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step3[16], &step3[17], &step3[18], &step3[19],
- &step3[20], &step3[21], &step3[22], &step3[23],
- &step3[24], &step3[25], &step3[26], &step3[27],
- &step3[28], &step3[29], &step3[30], &step3[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
+ &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
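/*
 * Scalar reading of the block just above (editor's illustration, not part of
 * the patch): step2[10..13] are rotated by cospi_16_64, i.e. their pairwise
 * sums and differences are scaled by cos(pi/4) and dct_const_round_shift-ed
 * into step3[10..13]; _mm_packs_epi32 additionally saturates to 16 bits.
 * Constant values are assumed to match vpx_dsp/txfm_common.h.
 */
#include <stdint.h>

#define SK2_DCT_CONST_BITS 14
#define SK2_DCT_CONST_ROUNDING (1 << (SK2_DCT_CONST_BITS - 1))
#define SK2_COSPI_16_64 11585 /* round(2^14 * cos(pi/4)), assumed value */

static int32_t rot_cospi16_sketch(int32_t a, int32_t b) {
  const int64_t p = (int64_t)(a + b) * SK2_COSPI_16_64;
  return (int32_t)((p + SK2_DCT_CONST_ROUNDING) >> SK2_DCT_CONST_BITS);
}

/* step3[10] = rot_cospi16_sketch(step2[13], -step2[10]);
 * step3[13] = rot_cospi16_sketch(step2[13],  step2[10]);
 * step3[11] = rot_cospi16_sketch(step2[12], -step2[11]);
 * step3[12] = rot_cospi16_sketch(step2[12],  step2[11]);  */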
+ {
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
+ &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
+ &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
+ &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
+ }
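// [Editor's note: illustrative sketch, not part of this commit.]
// In the DCT_HIGH_BIT_DEPTH build, every batch of freshly written 16-bit
// registers goes through a check_epi16_overflow_x{2,4,8,16} helper.  If any
// lane may have saturated, the SSE2 path gives up: on pass 0 it redoes the
// whole 2-D transform in C (HIGH_FDCT32x32_2D_C); on pass 1 it only redoes
// the row transform of the already-computed intermediate
// (HIGH_FDCT32x32_2D_ROWS_C).  Per lane, the test amounts to roughly the
// following (scalar sketch; the real helpers compare whole __m128i registers):

#include <stdint.h>

/* A lane sitting exactly on the int16 rails after saturating arithmetic
 * may have clipped, so it is treated as a potential overflow. */
static int lane_may_have_overflowed(int16_t v) {
  return v == INT16_MAX || v == INT16_MIN;
}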
- // Stage 4
- {
- step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
- step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
- step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
- step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
- step1[8] = ADD_EPI16(step3[11], step2[ 8]);
- step1[9] = ADD_EPI16(step3[10], step2[ 9]);
- step1[10] = SUB_EPI16(step2[ 9], step3[10]);
- step1[11] = SUB_EPI16(step2[ 8], step3[11]);
- step1[12] = SUB_EPI16(step2[15], step3[12]);
- step1[13] = SUB_EPI16(step2[14], step3[13]);
- step1[14] = ADD_EPI16(step3[13], step2[14]);
- step1[15] = ADD_EPI16(step3[12], step2[15]);
+ // Stage 4
+ {
+ step1[0] = ADD_EPI16(step3[3], step3[0]);
+ step1[1] = ADD_EPI16(step3[2], step3[1]);
+ step1[2] = SUB_EPI16(step3[1], step3[2]);
+ step1[3] = SUB_EPI16(step3[0], step3[3]);
+ step1[8] = ADD_EPI16(step3[11], step2[8]);
+ step1[9] = ADD_EPI16(step3[10], step2[9]);
+ step1[10] = SUB_EPI16(step2[9], step3[10]);
+ step1[11] = SUB_EPI16(step2[8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[0], &step1[1], &step1[2], &step1[3],
- &step1[4], &step1[5], &step1[6], &step1[7],
- &step1[8], &step1[9], &step1[10], &step1[11],
- &step1[12], &step1[13], &step1[14], &step1[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
+ &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
- // Combine
- step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
- step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
}
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
- // Combine
- step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
- step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
- step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
- step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
- step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
- step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
- step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
- step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
- &step1[21], &step1[26], &step1[27],
- &step1[28], &step1[29]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 5
- {
- step2[4] = ADD_EPI16(step1[5], step3[4]);
- step2[5] = SUB_EPI16(step3[4], step1[5]);
- step2[6] = SUB_EPI16(step3[7], step1[6]);
- step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
- &step2[6], &step2[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
}
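// [Editor's note: illustrative sketch, not part of this commit.]
// Each k__cospi_* constant packs two 16-bit cosines into every 32-bit lane,
// so _mm_madd_epi16(_mm_unpacklo_epi16(a, b), k) yields a*c0 + b*c1 per lane,
// i.e. one two-tap rotation per instruction.  For the step1[5]/step1[6] pair
// above the scalar equivalent is roughly the following (function name
// hypothetical; cospi_16_64 as defined in libvpx's txfm_common.h):

#include <stdint.h>

static void half_butterfly_pi4_sketch(int32_t s5, int32_t s6, /* step3[5..6] */
                                      int32_t *o5, int32_t *o6) {
  const int32_t cospi_16_64 = 11585; /* round(16384 * cos(pi/4)) */
  const int32_t rounding = 1 << 13;  /* 1 << (DCT_CONST_BITS - 1) */
  *o5 = ((s6 - s5) * cospi_16_64 + rounding) >> 14; /* step1[5] */
  *o6 = ((s6 + s5) * cospi_16_64 + rounding) >> 14; /* step1[6] */
}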
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
- const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
- const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
- const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
- const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
- const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
- const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
- const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
- const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
- const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
- const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
- const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
- const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
- const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
- const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
- const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
- const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
- const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
- const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
- const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
- const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
- const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
- const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
- const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
- const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
- const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
- const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
- // Combine
- out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
- out[16] = _mm_packs_epi32(out_16_6, out_16_7);
- out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
- out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&out[0], &out[16],
- &out[8], &out[24]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
- const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
- const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
- const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
- const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
- const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
- const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
- const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
- const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
- const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
- const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
- const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
- const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
- const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
- const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
- const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
- const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
- const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
- const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
- // Combine
- step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
- step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
- step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
- step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
- &step2[13], &step2[14]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
}
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step2[16] = ADD_EPI16(step1[19], step3[16]);
- step2[17] = ADD_EPI16(step1[18], step3[17]);
- step2[18] = SUB_EPI16(step3[17], step1[18]);
- step2[19] = SUB_EPI16(step3[16], step1[19]);
- step2[20] = SUB_EPI16(step3[23], step1[20]);
- step2[21] = SUB_EPI16(step3[22], step1[21]);
- step2[22] = ADD_EPI16(step1[21], step3[22]);
- step2[23] = ADD_EPI16(step1[20], step3[23]);
- step2[24] = ADD_EPI16(step1[27], step3[24]);
- step2[25] = ADD_EPI16(step1[26], step3[25]);
- step2[26] = SUB_EPI16(step3[25], step1[26]);
- step2[27] = SUB_EPI16(step3[24], step1[27]);
- step2[28] = SUB_EPI16(step3[31], step1[28]);
- step2[29] = SUB_EPI16(step3[30], step1[29]);
- step2[30] = ADD_EPI16(step1[29], step3[30]);
- step2[31] = ADD_EPI16(step1[28], step3[31]);
+ // Stage 5
+ {
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[16], &step2[17], &step2[18], &step2[19],
- &step2[20], &step2[21], &step2[22], &step2[23],
- &step2[24], &step2[25], &step2[26], &step2[27],
- &step2[28], &step2[29], &step2[30], &step2[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
+ &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 6
- {
- const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
- const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
- const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
- const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
- const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
- const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
- const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
- const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
- // dct_const_round_shift
- const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
- const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
- const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
- const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
- const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
- const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
- const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
- const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
- const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
- const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
- const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
- const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
- const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
- const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
- const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
- const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
- // Combine
- out[4] = _mm_packs_epi32(out_04_6, out_04_7);
- out[20] = _mm_packs_epi32(out_20_6, out_20_7);
- out[12] = _mm_packs_epi32(out_12_6, out_12_7);
- out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&out[4], &out[20],
- &out[12], &out[28]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
}
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
- step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
- step3[10] = SUB_EPI16(step1[11], step2[10]);
- step3[11] = ADD_EPI16(step2[10], step1[11]);
- step3[12] = ADD_EPI16(step2[13], step1[12]);
- step3[13] = SUB_EPI16(step1[12], step2[13]);
- step3[14] = SUB_EPI16(step1[15], step2[14]);
- step3[15] = ADD_EPI16(step2[14], step1[15]);
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 =
+ _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 =
+ _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 =
+ _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 =
+ _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 =
+ _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 =
+ _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 =
+ _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 =
+ _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
- &step3[11], &step3[12], &step3[13],
- &step3[14], &step3[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
- const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
- const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
- const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
- const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
- const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
- const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
- const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
- const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
- const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
- const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
- const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
- const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
- const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
- const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
- const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
- const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
- const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
- const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
- const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
- const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
- const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
- const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
- const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
- // dct_const_round_shift
- const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
- const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
- const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
- const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
- const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
- const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
- const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
- const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
- const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
- const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
- const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
- const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
- const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
- const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
- const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
- const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
- // Combine
- step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
- step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
- step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
- step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
- // Combine
- step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
- step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
- step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
- step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
- &step3[22], &step3[25], &step3[26],
- &step3[29], &step3[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
}
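// [Editor's note: illustrative sketch, not part of this commit.]
// The block above emits four even-frequency coefficients directly.  Written
// out per lane (helper name hypothetical; cospi values as in libvpx's
// txfm_common.h), the four madd/round/pack chains compute roughly:

#include <stdint.h>

static void even_outputs_sketch(const int32_t step1[4], int32_t out[32]) {
  const int32_t cospi_16_64 = 11585, cospi_8_64 = 15137, cospi_24_64 = 6270;
  const int32_t r = 1 << 13; /* DCT rounding constant */
  out[0] = ((step1[0] + step1[1]) * cospi_16_64 + r) >> 14;
  out[16] = ((step1[0] - step1[1]) * cospi_16_64 + r) >> 14;
  out[8] = (step1[2] * cospi_24_64 + step1[3] * cospi_8_64 + r) >> 14;
  out[24] = (step1[3] * cospi_24_64 - step1[2] * cospi_8_64 + r) >> 14;
}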
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
+ &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 7
- {
- const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
- const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
- const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
- const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
- const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
- const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
- const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
- const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
- const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
- const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
- const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
- const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
- const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
- const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
- const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
- const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
- const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
- const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
- const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
- const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
- const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
- const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
- const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
- const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
- // dct_const_round_shift
- const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
- const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
- const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
- const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
- const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
- const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
- const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
- const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
- const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
- const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
- const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
- const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
- const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
- const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
- const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
- const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
- const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
- const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
- const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
- const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
- const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
- const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
- const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
- const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
- const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
- const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
- const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
- const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
- const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
- const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
- const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
- const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
- // Combine
- out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
- out[18] = _mm_packs_epi32(out_18_6, out_18_7);
- out[10] = _mm_packs_epi32(out_10_6, out_10_7);
- out[26] = _mm_packs_epi32(out_26_6, out_26_7);
- out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
- out[22] = _mm_packs_epi32(out_22_6, out_22_7);
- out[14] = _mm_packs_epi32(out_14_6, out_14_7);
- out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
- &out[26], &out[6], &out[22],
- &out[14], &out[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
+ &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
+ &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 =
+ _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 =
+ _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 =
+ _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 =
+ _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 =
+ _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 =
+ _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 =
+ _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 =
+ _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step1[16] = ADD_EPI16(step3[17], step2[16]);
- step1[17] = SUB_EPI16(step2[16], step3[17]);
- step1[18] = SUB_EPI16(step2[19], step3[18]);
- step1[19] = ADD_EPI16(step3[18], step2[19]);
- step1[20] = ADD_EPI16(step3[21], step2[20]);
- step1[21] = SUB_EPI16(step2[20], step3[21]);
- step1[22] = SUB_EPI16(step2[23], step3[22]);
- step1[23] = ADD_EPI16(step3[22], step2[23]);
- step1[24] = ADD_EPI16(step3[25], step2[24]);
- step1[25] = SUB_EPI16(step2[24], step3[25]);
- step1[26] = SUB_EPI16(step2[27], step3[26]);
- step1[27] = ADD_EPI16(step3[26], step2[27]);
- step1[28] = ADD_EPI16(step3[29], step2[28]);
- step1[29] = SUB_EPI16(step2[28], step3[29]);
- step1[30] = SUB_EPI16(step2[31], step3[30]);
- step1[31] = ADD_EPI16(step3[30], step2[31]);
+ }
+ {
+ step3[8] = ADD_EPI16(step2[9], step1[8]);
+ step3[9] = SUB_EPI16(step1[8], step2[9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[16], &step1[17], &step1[18], &step1[19],
- &step1[20], &step1[21], &step1[22], &step1[23],
- &step1[24], &step1[25], &step1[26], &step1[27],
- &step1[28], &step1[29], &step1[30], &step1[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- // Final stage --- outputs indices are bit-reversed.
- {
- const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
- const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
- const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
- const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
- const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
- const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
- const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
- const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
- const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
- const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
- const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
- const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
- const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
- const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
- const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
- const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
- const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
- const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
- const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
- const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
- const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
- const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
- const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
- const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
- // dct_const_round_shift
- const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
- const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
- const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
- const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
- const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
- const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
- const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
- const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
- const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
- const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
- const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
- const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
- const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
- const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
- const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
- const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
- const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
- const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
- const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
- const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
- const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
- const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
- const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
- const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
- const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
- const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
- const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
- const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
- const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
- const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
- const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
- const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
- // Combine
- out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
- out[17] = _mm_packs_epi32(out_17_6, out_17_7);
- out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
- out[25] = _mm_packs_epi32(out_25_6, out_25_7);
- out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
- out[23] = _mm_packs_epi32(out_23_6, out_23_7);
- out[15] = _mm_packs_epi32(out_15_6, out_15_7);
- out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 =
+ _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 =
+ _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 =
+ _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 =
+ _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 =
+ _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 =
+ _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 =
+ _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 =
+ _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 =
+ _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 =
+ _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 =
+ _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 =
+ _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 =
+ _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 =
+ _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 =
+ _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 =
+ _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
- &out[25], &out[7], &out[23],
- &out[15], &out[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
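
The dct_const_round_shift comments in these blocks name the rounding that the _mm_add_epi32 / _mm_srai_epi32 pairs implement: add half of 2^DCT_CONST_BITS, then arithmetic-shift right by DCT_CONST_BITS. A minimal scalar sketch of that step, assuming DCT_CONST_BITS is 14 and k__DCT_CONST_ROUNDING is 1 << 13 as elsewhere in vpx_dsp; the names below are local to the sketch, not library API:

    #include <stdint.h>

    /* Scalar equivalent of the add-rounding-constant / shift-right pair:
     * round each 32-bit product to nearest before dropping the fractional
     * cosine bits. kBits and kRounding mirror DCT_CONST_BITS and
     * k__DCT_CONST_ROUNDING (assumed to be 14 and 1 << 13). */
    enum { kBits = 14, kRounding = 1 << (kBits - 1) };

    static int32_t dct_round_shift32(int32_t x) {
      return (x + kRounding) >> kBits;
    }
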
+ {
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
+ &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
+ &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
- const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
- const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
- const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
- const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
- const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
- const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
- const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
- const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
- const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
- const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
- const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
- const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
- const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
- const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
- const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
- const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
- const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
- const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
- const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
- const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
- const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
- const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
- const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
- // dct_const_round_shift
- const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
- const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
- const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
- const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
- const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
- const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
- const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
- const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
- const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
- const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
- const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
- const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
- const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
- const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
- const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
- const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
- const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
- const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
- const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
- const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
- const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
- const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
- const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
- const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
- const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
- const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
- const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
- const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
- const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
- const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
- const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
- const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
- // Combine
- out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
- out[21] = _mm_packs_epi32(out_21_6, out_21_7);
- out[13] = _mm_packs_epi32(out_13_6, out_13_7);
- out[29] = _mm_packs_epi32(out_29_6, out_29_7);
- out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
- out[19] = _mm_packs_epi32(out_19_6, out_19_7);
- out[11] = _mm_packs_epi32(out_11_6, out_11_7);
- out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+ }
+      // Final stage --- output indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 =
+ _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 =
+ _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 =
+ _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 =
+ _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 =
+ _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 =
+ _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 =
+ _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 =
+ _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 =
+ _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 =
+ _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 =
+ _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 =
+ _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 =
+ _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 =
+ _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 =
+ _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 =
+ _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
- &out[29], &out[3], &out[19],
- &out[11], &out[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
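
The "Final stage" comment heading this block refers to the store pattern visible above: each butterfly pair step1[16+i] / step1[31-i] lands in out[] at the 5-bit bit-reversal of its step1 index, e.g. step1[16] feeds out[1] (10000b reversed is 00001b) and step1[18] feeds out[9]. A small helper makes the mapping concrete; it is illustrative only and not part of the library:

    /* 5-bit bit-reversal for a 32-point transform: bitrev5(16) == 1,
     * bitrev5(18) == 9, bitrev5(20) == 5, matching the out[] slots in
     * these final-stage blocks. */
    static int bitrev5(int i) {
      int r = 0, k;
      for (k = 0; k < 5; ++k) r |= ((i >> k) & 1) << (4 - k);
      return r;
    }
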
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 =
+ _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 =
+ _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 =
+ _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 =
+ _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 =
+ _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 =
+ _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 =
+ _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 =
+ _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 =
+ _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 =
+ _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 =
+ _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 =
+ _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 =
+ _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 =
+ _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 =
+ _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 =
+ _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- }
+ }
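
Every out[k] in these blocks is produced by _mm_packs_epi32, which narrows 32-bit lanes to 16 bits with signed saturation, so a coefficient outside the int16_t range would be clamped silently. The check_epi16_overflow_x* guards in the DCT_HIGH_BIT_DEPTH build catch that case and fall back to the C transform: the whole transform when pass == 0, rows only when pass != 0, presumably because the column results already stored in intermediate can be reused. A scalar sketch of the condition being guarded, illustrative only and not the library's helper:

    #include <limits.h>
    #include <stdint.h>

    /* Nonzero when narrowing v to int16_t would saturate, i.e. when the
     * 16-bit SSE2 path can no longer represent the coefficient exactly. */
    static int would_saturate16(int32_t v) {
      return v > INT16_MAX || v < INT16_MIN;
    }
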
#if FDCT32x32_HIGH_PRECISION
} else {
__m128i lstep1[64], lstep2[64], lstep3[64];
@@ -1457,32 +1512,32 @@ void FDCT32x32_2D(const int16_t *input,
// stage 3
{
         // expanding to 32-bit length prior to addition operations
- lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
- lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
- lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
- lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
- lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
- lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
- lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
- lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
- lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
- lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
- lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
- lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
- lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
- lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
- lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
- lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
- lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
- lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
- lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
- lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
- lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
- lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
- lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
- lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
- lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
- lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+ lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
+ lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
+ lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
+ lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
+ lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
+ lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
+ lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
+ lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
+ lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
+ lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
+ lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
+ lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
+ lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
+ lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
+ lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
+ lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
+ lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
+ lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
+ lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
+ lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
+ lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
+ lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
+ lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
+ lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
+ lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
+ lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
@@ -1490,22 +1545,22 @@ void FDCT32x32_2D(const int16_t *input,
lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
- lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
- lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
- lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
- lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
- lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
- lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
- lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
- lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
- lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
- lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
- lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
- lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
- lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
- lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
- lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
- lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+ lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
+ lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
+ lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
+ lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
+ lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
+ lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
+ lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
+ lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
+ lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
+ lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
+ lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
+ lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
+ lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
+ lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
+ lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
+ lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
}
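
The unpack-with-kZero followed by _mm_madd_epi16 with kOne above is an SSE2 idiom for sign-extending eight int16_t lanes into two registers of int32_t: PMADDWD multiplies signed 16-bit pairs and sums each pair into a 32-bit lane, so pairing every value with zero and multiplying by one yields the sign-extended value (a bare unpack against zero would only zero-extend, which is wrong for negative coefficients). A self-contained sketch of the idiom; on SSE4.1 the low half could use _mm_cvtepi16_epi32 instead, but this file targets SSE2:

    #include <emmintrin.h>

    /* Sign-extend the eight int16_t lanes of x into two __m128i of int32_t
     * using only SSE2, mirroring the kZero/kOne pattern above. */
    static void widen_epi16_to_epi32(__m128i x, __m128i *lo, __m128i *hi) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i one = _mm_set1_epi16(1);
      *lo = _mm_madd_epi16(_mm_unpacklo_epi16(x, zero), one);
      *hi = _mm_madd_epi16(_mm_unpackhi_epi16(x, zero), one);
    }
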
{
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
@@ -1643,10 +1698,10 @@ void FDCT32x32_2D(const int16_t *input,
// stage 4
{
         // expanding to 32-bit length prior to addition operations
- lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
- lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
- lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
- lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
+ lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
+ lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
+ lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
@@ -1660,14 +1715,14 @@ void FDCT32x32_2D(const int16_t *input,
lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
- lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
- lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
- lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
- lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
- lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
- lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
- lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
- lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
+ lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
@@ -1686,64 +1741,64 @@ void FDCT32x32_2D(const int16_t *input,
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
- // to be continued...
- //
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[0] = k_madd_epi32(u[0], k32_p16_m16);
- v[1] = k_madd_epi32(u[1], k32_p16_m16);
- v[2] = k_madd_epi32(u[2], k32_p16_m16);
- v[3] = k_madd_epi32(u[3], k32_p16_m16);
- v[4] = k_madd_epi32(u[0], k32_p16_p16);
- v[5] = k_madd_epi32(u[1], k32_p16_p16);
- v[6] = k_madd_epi32(u[2], k32_p16_p16);
- v[7] = k_madd_epi32(u[3], k32_p16_p16);
+ // to be continued...
+ //
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
- &v[4], &v[5], &v[6], &v[7], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
+ &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
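
The k_madd_epi32 / k_packs_epi64 sequence above is the 32-bit counterpart of the 16-bit _mm_madd_epi16 butterflies: each interleaved pair of 32-bit inputs is multiplied by a pair of cosine constants, accumulated at 64-bit width, packed back down to 32 bits, then rounded and shifted by DCT_CONST_BITS. One lane of that computation as a scalar sketch, assuming the constant built by pair_set_epi32(c0, c1) multiplies the first element by c0 and the second by c1, which is how the call sites read:

    #include <stdint.h>

    /* a*c0 + b*c1 accumulated at 64 bits to avoid intermediate overflow,
     * followed by the usual round-and-shift. With c0 = cospi_16_64 and
     * c1 = -cospi_16_64 this is the (a - b) * cospi_16_64 term that appears
     * to feed lstep1[10..11] above. */
    static int32_t rotate_lane(int32_t a, int32_t b, int32_t c0, int32_t c1,
                               int bits) {
      const int64_t acc = (int64_t)a * c0 + (int64_t)b * c1;
      return (int32_t)((acc + ((int64_t)1 << (bits - 1))) >> bits);
    }
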
{
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
- u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
- u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
- u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
- u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
- u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
- u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
- u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
- u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
- u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
- u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
@@ -1751,16 +1806,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
- v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
- v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
- v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
- v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
- v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
- v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
- v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
- v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
- v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
- v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32(u[9], k32_m24_m08);
v[10] = k_madd_epi32(u[10], k32_m24_m08);
v[11] = k_madd_epi32(u[11], k32_m24_m08);
v[12] = k_madd_epi32(u[12], k32_m24_m08);
@@ -1771,41 +1826,40 @@ void FDCT32x32_2D(const int16_t *input,
v[17] = k_madd_epi32(u[13], k32_m08_p24);
v[18] = k_madd_epi32(u[14], k32_m08_p24);
v[19] = k_madd_epi32(u[15], k32_m08_p24);
- v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
- v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+ v[20] = k_madd_epi32(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[9], k32_m08_p24);
v[22] = k_madd_epi32(u[10], k32_m08_p24);
v[23] = k_madd_epi32(u[11], k32_m08_p24);
- v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
- v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
- v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
- v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
- v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
- v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
- v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
- v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+ v[24] = k_madd_epi32(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[3], k32_p24_p08);
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
- &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
}
#endif // DCT_HIGH_BIT_DEPTH
- u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64(v[10], v[11]);
- u[ 6] = k_packs_epi64(v[12], v[13]);
- u[ 7] = k_packs_epi64(v[14], v[15]);
- u[ 8] = k_packs_epi64(v[16], v[17]);
- u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
u[10] = k_packs_epi64(v[20], v[21]);
u[11] = k_packs_epi64(v[22], v[23]);
u[12] = k_packs_epi64(v[24], v[25]);
@@ -1813,16 +1867,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = k_packs_epi64(v[28], v[29]);
u[15] = k_packs_epi64(v[30], v[31]);
- v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -1830,16 +1884,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
- lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
- lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
- lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
- lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
- lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
- lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
- lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
- lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
- lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
@@ -1849,10 +1903,10 @@ void FDCT32x32_2D(const int16_t *input,
}
// stage 5
{
- lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
- lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
- lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
- lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
+ lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
@@ -1875,16 +1929,16 @@ void FDCT32x32_2D(const int16_t *input,
// TODO(jingning): manually inline k_madd_epi32_ to further hide
// instruction latency.
- v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
- v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
- v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
- v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
- v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
- v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
- v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
- v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
- v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
- v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
+ v[0] = k_madd_epi32(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32(u[5], k32_p24_p08);
v[10] = k_madd_epi32(u[6], k32_p24_p08);
v[11] = k_madd_epi32(u[7], k32_p24_p08);
v[12] = k_madd_epi32(u[4], k32_m08_p24);
@@ -1894,9 +1948,8 @@ void FDCT32x32_2D(const int16_t *input,
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -1966,13 +2019,13 @@ void FDCT32x32_2D(const int16_t *input,
u[7] = _mm_srai_epi32(u[7], 2);
// Combine
- out[ 0] = _mm_packs_epi32(u[0], u[1]);
+ out[0] = _mm_packs_epi32(u[0], u[1]);
out[16] = _mm_packs_epi32(u[2], u[3]);
- out[ 8] = _mm_packs_epi32(u[4], u[5]);
+ out[8] = _mm_packs_epi32(u[4], u[5]);
out[24] = _mm_packs_epi32(u[6], u[7]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&out[0], &out[16],
- &out[8], &out[24]);
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -2001,8 +2054,8 @@ void FDCT32x32_2D(const int16_t *input,
v[5] = k_madd_epi32(u[5], k32_m24_m08);
v[6] = k_madd_epi32(u[6], k32_m24_m08);
v[7] = k_madd_epi32(u[7], k32_m24_m08);
- v[ 8] = k_madd_epi32(u[4], k32_m08_p24);
- v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
+ v[8] = k_madd_epi32(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32(u[5], k32_m08_p24);
v[10] = k_madd_epi32(u[6], k32_m08_p24);
v[11] = k_madd_epi32(u[7], k32_m08_p24);
v[12] = k_madd_epi32(u[0], k32_p24_p08);
@@ -2012,9 +2065,8 @@ void FDCT32x32_2D(const int16_t *input,
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -2088,10 +2140,10 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
- u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
- u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
- u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
- u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
@@ -2100,10 +2152,10 @@ void FDCT32x32_2D(const int16_t *input,
u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
- u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
- u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
- u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
v[0] = k_madd_epi32(u[0], k32_p28_p04);
v[1] = k_madd_epi32(u[1], k32_p28_p04);
@@ -2113,8 +2165,8 @@ void FDCT32x32_2D(const int16_t *input,
v[5] = k_madd_epi32(u[5], k32_p12_p20);
v[6] = k_madd_epi32(u[6], k32_p12_p20);
v[7] = k_madd_epi32(u[7], k32_p12_p20);
- v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
- v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
v[10] = k_madd_epi32(u[10], k32_m20_p12);
v[11] = k_madd_epi32(u[11], k32_m20_p12);
v[12] = k_madd_epi32(u[12], k32_m04_p28);
@@ -2124,9 +2176,8 @@ void FDCT32x32_2D(const int16_t *input,
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -2195,13 +2246,13 @@ void FDCT32x32_2D(const int16_t *input,
u[6] = _mm_srai_epi32(u[6], 2);
u[7] = _mm_srai_epi32(u[7], 2);
- out[ 4] = _mm_packs_epi32(u[0], u[1]);
+ out[4] = _mm_packs_epi32(u[0], u[1]);
out[20] = _mm_packs_epi32(u[2], u[3]);
out[12] = _mm_packs_epi32(u[4], u[5]);
out[28] = _mm_packs_epi32(u[6], u[7]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&out[4], &out[20],
- &out[12], &out[28]);
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -2230,21 +2281,21 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
- -cospi_20_64);
+ const __m128i k32_m12_m20 =
+ pair_set_epi32(-cospi_12_64, -cospi_20_64);
const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
- u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
- u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
- u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
- u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
- u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
- u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
- u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
- u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
- u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
- u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
@@ -2252,16 +2303,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
- v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);
- v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
- v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
- v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
- v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);
- v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
- v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
- v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
- v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
- v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+ v[0] = k_madd_epi32(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
v[10] = k_madd_epi32(u[10], k32_m20_p12);
v[11] = k_madd_epi32(u[11], k32_m20_p12);
v[12] = k_madd_epi32(u[12], k32_m12_m20);
@@ -2272,41 +2323,40 @@ void FDCT32x32_2D(const int16_t *input,
v[17] = k_madd_epi32(u[13], k32_m20_p12);
v[18] = k_madd_epi32(u[14], k32_m20_p12);
v[19] = k_madd_epi32(u[15], k32_m20_p12);
- v[20] = k_madd_epi32(u[ 8], k32_p12_p20);
- v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
+ v[20] = k_madd_epi32(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32(u[9], k32_p12_p20);
v[22] = k_madd_epi32(u[10], k32_p12_p20);
v[23] = k_madd_epi32(u[11], k32_p12_p20);
- v[24] = k_madd_epi32(u[ 4], k32_m04_p28);
- v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
- v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
- v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
- v[28] = k_madd_epi32(u[ 0], k32_p28_p04);
- v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
- v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
- v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+ v[24] = k_madd_epi32(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32(u[3], k32_p28_p04);
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
- &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
}
#endif // DCT_HIGH_BIT_DEPTH
- u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64(v[10], v[11]);
- u[ 6] = k_packs_epi64(v[12], v[13]);
- u[ 7] = k_packs_epi64(v[14], v[15]);
- u[ 8] = k_packs_epi64(v[16], v[17]);
- u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
u[10] = k_packs_epi64(v[20], v[21]);
u[11] = k_packs_epi64(v[22], v[23]);
u[12] = k_packs_epi64(v[24], v[25]);
@@ -2314,16 +2364,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = k_packs_epi64(v[28], v[29]);
u[15] = k_packs_epi64(v[30], v[31]);
- v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2331,16 +2381,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
- lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
- lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
- lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
- lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
- lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
- lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
- lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
- lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
- lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2353,22 +2403,22 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
- const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
+ const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
- u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
- u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
- u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
- u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
- u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
- u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
- u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
- u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
- u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
- u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
@@ -2376,16 +2426,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
- v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);
- v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
- v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
- v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
- v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);
- v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
- v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
- v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
- v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);
- v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
+ v[0] = k_madd_epi32(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32(u[9], k32_p22_p10);
v[10] = k_madd_epi32(u[10], k32_p22_p10);
v[11] = k_madd_epi32(u[11], k32_p22_p10);
v[12] = k_madd_epi32(u[12], k32_p06_p26);
@@ -2396,41 +2446,40 @@ void FDCT32x32_2D(const int16_t *input,
v[17] = k_madd_epi32(u[13], k32_m26_p06);
v[18] = k_madd_epi32(u[14], k32_m26_p06);
v[19] = k_madd_epi32(u[15], k32_m26_p06);
- v[20] = k_madd_epi32(u[ 8], k32_m10_p22);
- v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
+ v[20] = k_madd_epi32(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32(u[9], k32_m10_p22);
v[22] = k_madd_epi32(u[10], k32_m10_p22);
v[23] = k_madd_epi32(u[11], k32_m10_p22);
- v[24] = k_madd_epi32(u[ 4], k32_m18_p14);
- v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
- v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
- v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
- v[28] = k_madd_epi32(u[ 0], k32_m02_p30);
- v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
- v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
- v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+ v[24] = k_madd_epi32(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32(u[3], k32_m02_p30);
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
- &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
}
#endif // DCT_HIGH_BIT_DEPTH
- u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64(v[10], v[11]);
- u[ 6] = k_packs_epi64(v[12], v[13]);
- u[ 7] = k_packs_epi64(v[14], v[15]);
- u[ 8] = k_packs_epi64(v[16], v[17]);
- u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
u[10] = k_packs_epi64(v[20], v[21]);
u[11] = k_packs_epi64(v[22], v[23]);
u[12] = k_packs_epi64(v[24], v[25]);
@@ -2438,16 +2487,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = k_packs_epi64(v[28], v[29]);
u[15] = k_packs_epi64(v[30], v[31]);
- v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2455,16 +2504,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
- u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
- u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
- u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
- u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
- u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
- u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
- u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
- u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
- u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2472,16 +2521,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
- v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
- v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
- v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
- v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
- v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
- v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
- v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
- v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
- v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
v[10] = _mm_cmplt_epi32(u[10], kZero);
v[11] = _mm_cmplt_epi32(u[11], kZero);
v[12] = _mm_cmplt_epi32(u[12], kZero);
@@ -2489,16 +2538,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_cmplt_epi32(u[14], kZero);
v[15] = _mm_cmplt_epi32(u[15], kZero);
- u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
- u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
- u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
- u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
- u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
- u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
- u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
- u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
- u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
- u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
u[10] = _mm_sub_epi32(u[10], v[10]);
u[11] = _mm_sub_epi32(u[11], v[11]);
u[12] = _mm_sub_epi32(u[12], v[12]);
@@ -2506,16 +2555,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_sub_epi32(u[14], v[14]);
u[15] = _mm_sub_epi32(u[15], v[15]);
- v[ 0] = _mm_add_epi32(u[ 0], K32One);
- v[ 1] = _mm_add_epi32(u[ 1], K32One);
- v[ 2] = _mm_add_epi32(u[ 2], K32One);
- v[ 3] = _mm_add_epi32(u[ 3], K32One);
- v[ 4] = _mm_add_epi32(u[ 4], K32One);
- v[ 5] = _mm_add_epi32(u[ 5], K32One);
- v[ 6] = _mm_add_epi32(u[ 6], K32One);
- v[ 7] = _mm_add_epi32(u[ 7], K32One);
- v[ 8] = _mm_add_epi32(u[ 8], K32One);
- v[ 9] = _mm_add_epi32(u[ 9], K32One);
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
v[10] = _mm_add_epi32(u[10], K32One);
v[11] = _mm_add_epi32(u[11], K32One);
v[12] = _mm_add_epi32(u[12], K32One);
@@ -2523,16 +2572,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_add_epi32(u[14], K32One);
v[15] = _mm_add_epi32(u[15], K32One);
- u[ 0] = _mm_srai_epi32(v[ 0], 2);
- u[ 1] = _mm_srai_epi32(v[ 1], 2);
- u[ 2] = _mm_srai_epi32(v[ 2], 2);
- u[ 3] = _mm_srai_epi32(v[ 3], 2);
- u[ 4] = _mm_srai_epi32(v[ 4], 2);
- u[ 5] = _mm_srai_epi32(v[ 5], 2);
- u[ 6] = _mm_srai_epi32(v[ 6], 2);
- u[ 7] = _mm_srai_epi32(v[ 7], 2);
- u[ 8] = _mm_srai_epi32(v[ 8], 2);
- u[ 9] = _mm_srai_epi32(v[ 9], 2);
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
u[10] = _mm_srai_epi32(v[10], 2);
u[11] = _mm_srai_epi32(v[11], 2);
u[12] = _mm_srai_epi32(v[12], 2);
@@ -2540,18 +2589,18 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_srai_epi32(v[14], 2);
u[15] = _mm_srai_epi32(v[15], 2);
- out[ 2] = _mm_packs_epi32(u[0], u[1]);
+ out[2] = _mm_packs_epi32(u[0], u[1]);
out[18] = _mm_packs_epi32(u[2], u[3]);
out[10] = _mm_packs_epi32(u[4], u[5]);
out[26] = _mm_packs_epi32(u[6], u[7]);
- out[ 6] = _mm_packs_epi32(u[8], u[9]);
+ out[6] = _mm_packs_epi32(u[8], u[9]);
out[22] = _mm_packs_epi32(u[10], u[11]);
out[14] = _mm_packs_epi32(u[12], u[13]);
out[30] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
- &out[26], &out[6], &out[22],
- &out[14], &out[30]);
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -2603,16 +2652,16 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
- u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
- u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
- u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
- u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
- u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
- u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
- u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
- u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
- u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
- u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
@@ -2620,16 +2669,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
- v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
- v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
- v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
- v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
- v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
- v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
- v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
- v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
- v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
- v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
+ v[0] = k_madd_epi32(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32(u[9], k32_p23_p09);
v[10] = k_madd_epi32(u[10], k32_p23_p09);
v[11] = k_madd_epi32(u[11], k32_p23_p09);
v[12] = k_madd_epi32(u[12], k32_p07_p25);
@@ -2640,41 +2689,40 @@ void FDCT32x32_2D(const int16_t *input,
v[17] = k_madd_epi32(u[13], k32_m25_p07);
v[18] = k_madd_epi32(u[14], k32_m25_p07);
v[19] = k_madd_epi32(u[15], k32_m25_p07);
- v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
- v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
+ v[20] = k_madd_epi32(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32(u[9], k32_m09_p23);
v[22] = k_madd_epi32(u[10], k32_m09_p23);
v[23] = k_madd_epi32(u[11], k32_m09_p23);
- v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
- v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
- v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
- v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
- v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
- v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
- v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
- v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+ v[24] = k_madd_epi32(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32(u[3], k32_m01_p31);
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
- &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
}
#endif // DCT_HIGH_BIT_DEPTH
- u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64(v[10], v[11]);
- u[ 6] = k_packs_epi64(v[12], v[13]);
- u[ 7] = k_packs_epi64(v[14], v[15]);
- u[ 8] = k_packs_epi64(v[16], v[17]);
- u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
u[10] = k_packs_epi64(v[20], v[21]);
u[11] = k_packs_epi64(v[22], v[23]);
u[12] = k_packs_epi64(v[24], v[25]);
@@ -2682,16 +2730,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = k_packs_epi64(v[28], v[29]);
u[15] = k_packs_epi64(v[30], v[31]);
- v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2699,16 +2747,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
- u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
- u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
- u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
- u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
- u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
- u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
- u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
- u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
- u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2716,16 +2764,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
- v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
- v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
- v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
- v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
- v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
- v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
- v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
- v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
- v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
v[10] = _mm_cmplt_epi32(u[10], kZero);
v[11] = _mm_cmplt_epi32(u[11], kZero);
v[12] = _mm_cmplt_epi32(u[12], kZero);
@@ -2733,16 +2781,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_cmplt_epi32(u[14], kZero);
v[15] = _mm_cmplt_epi32(u[15], kZero);
- u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
- u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
- u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
- u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
- u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
- u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
- u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
- u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
- u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
- u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
u[10] = _mm_sub_epi32(u[10], v[10]);
u[11] = _mm_sub_epi32(u[11], v[11]);
u[12] = _mm_sub_epi32(u[12], v[12]);
@@ -2784,18 +2832,18 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_srai_epi32(v[14], 2);
u[15] = _mm_srai_epi32(v[15], 2);
- out[ 1] = _mm_packs_epi32(u[0], u[1]);
+ out[1] = _mm_packs_epi32(u[0], u[1]);
out[17] = _mm_packs_epi32(u[2], u[3]);
- out[ 9] = _mm_packs_epi32(u[4], u[5]);
+ out[9] = _mm_packs_epi32(u[4], u[5]);
out[25] = _mm_packs_epi32(u[6], u[7]);
- out[ 7] = _mm_packs_epi32(u[8], u[9]);
+ out[7] = _mm_packs_epi32(u[8], u[9]);
out[23] = _mm_packs_epi32(u[10], u[11]);
out[15] = _mm_packs_epi32(u[12], u[13]);
out[31] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
- &out[25], &out[7], &out[23],
- &out[15], &out[31]);
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
@@ -2812,16 +2860,16 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
- u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
- u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
- u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
- u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
- u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
- u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
- u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
- u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
- u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
- u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
@@ -2829,16 +2877,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
- v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
- v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
- v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
- v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
- v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
- v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
- v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
- v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
- v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
- v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
+ v[0] = k_madd_epi32(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32(u[9], k32_p19_p13);
v[10] = k_madd_epi32(u[10], k32_p19_p13);
v[11] = k_madd_epi32(u[11], k32_p19_p13);
v[12] = k_madd_epi32(u[12], k32_p03_p29);
@@ -2849,41 +2897,40 @@ void FDCT32x32_2D(const int16_t *input,
v[17] = k_madd_epi32(u[13], k32_m29_p03);
v[18] = k_madd_epi32(u[14], k32_m29_p03);
v[19] = k_madd_epi32(u[15], k32_m29_p03);
- v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
- v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
+ v[20] = k_madd_epi32(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32(u[9], k32_m13_p19);
v[22] = k_madd_epi32(u[10], k32_m13_p19);
v[23] = k_madd_epi32(u[11], k32_m13_p19);
- v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
- v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
- v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
- v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
- v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
- v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
- v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
- v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+ v[24] = k_madd_epi32(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32(u[3], k32_m05_p27);
#if DCT_HIGH_BIT_DEPTH
overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
- &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
- &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
- &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
- &kZero);
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
}
#endif // DCT_HIGH_BIT_DEPTH
- u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
- u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
- u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
- u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
- u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
- u[ 5] = k_packs_epi64(v[10], v[11]);
- u[ 6] = k_packs_epi64(v[12], v[13]);
- u[ 7] = k_packs_epi64(v[14], v[15]);
- u[ 8] = k_packs_epi64(v[16], v[17]);
- u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
u[10] = k_packs_epi64(v[20], v[21]);
u[11] = k_packs_epi64(v[22], v[23]);
u[12] = k_packs_epi64(v[24], v[25]);
@@ -2891,16 +2938,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = k_packs_epi64(v[28], v[29]);
u[15] = k_packs_epi64(v[30], v[31]);
- v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
- v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
- v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
- v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
- v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
- v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
- v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
- v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
- v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
- v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
@@ -2908,16 +2955,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
- u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
- u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
- u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
- u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
- u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
- u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
- u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
- u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
- u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
@@ -2925,16 +2972,16 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
- v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
- v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
- v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
- v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
- v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
- v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
- v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
- v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
- v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
v[10] = _mm_cmplt_epi32(u[10], kZero);
v[11] = _mm_cmplt_epi32(u[11], kZero);
v[12] = _mm_cmplt_epi32(u[12], kZero);
@@ -2942,16 +2989,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = _mm_cmplt_epi32(u[14], kZero);
v[15] = _mm_cmplt_epi32(u[15], kZero);
- u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
- u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
- u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
- u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
- u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
- u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
- u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
- u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
- u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
- u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
u[10] = _mm_sub_epi32(u[10], v[10]);
u[11] = _mm_sub_epi32(u[11], v[11]);
u[12] = _mm_sub_epi32(u[12], v[12]);
@@ -2993,18 +3040,18 @@ void FDCT32x32_2D(const int16_t *input,
u[14] = _mm_srai_epi32(v[14], 2);
u[15] = _mm_srai_epi32(v[15], 2);
- out[ 5] = _mm_packs_epi32(u[0], u[1]);
+ out[5] = _mm_packs_epi32(u[0], u[1]);
out[21] = _mm_packs_epi32(u[2], u[3]);
out[13] = _mm_packs_epi32(u[4], u[5]);
out[29] = _mm_packs_epi32(u[6], u[7]);
- out[ 3] = _mm_packs_epi32(u[8], u[9]);
+ out[3] = _mm_packs_epi32(u[8], u[9]);
out[19] = _mm_packs_epi32(u[10], u[11]);
out[11] = _mm_packs_epi32(u[12], u[13]);
out[27] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
- &out[29], &out[3], &out[19],
- &out[11], &out[27]);
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
if (overflow) {
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
return;
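
Note on the hunks above: the re-flowed lines all express the same fixed-point pattern, a multiply-accumulate against a pair of cos(N*pi/64) constants (k_madd_epi32 followed by k_packs_epi64), a round and shift by DCT_CONST_BITS, and, before packing back to 16 bits, the extra sign-aware rounding done with cmplt/sub/add/srai. A scalar sketch of that arithmetic, assuming libvpx's DCT_CONST_BITS of 14; the helper names butterfly_q14 and final_round_shift2 are illustrative, not functions from the patch.

#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* k_madd_epi32 + k_packs_epi64 + add/srai: (a*c0 + b*c1 + R) >> 14,
 * done in 64 bits so the product cannot wrap before the shift. */
static int32_t butterfly_q14(int32_t a, int32_t b, int32_t c0, int32_t c1) {
  const int64_t sum = (int64_t)a * c0 + (int64_t)b * c1;
  return (int32_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

/* The cmplt/sub/add/srai tail before _mm_packs_epi32: _mm_cmplt_epi32
 * yields all-ones (-1) for negative lanes, so subtracting it adds one
 * to negative values; adding K32One and shifting by 2 then gives the
 * rounded >>2 used before the final pack. */
static int32_t final_round_shift2(int32_t u) {
  if (u < 0) u += 1;   /* _mm_sub_epi32(u, _mm_cmplt_epi32(u, kZero)) */
  return (u + 1) >> 2; /* _mm_add_epi32(u, K32One); _mm_srai_epi32(v, 2) */
}

int main(void) {
  /* cos(pi/4) in Q14 is 11585 (libvpx's cospi_16_64); any pair of the
   * k32_* constants above plays the same role in the real kernels. */
  const int32_t t = butterfly_q14(100, -40, 11585, 11585);
  printf("%d -> %d\n", t, final_round_shift2(t));
  return 0;
}

When DCT_HIGH_BIT_DEPTH is set, each such stage is followed by the k_check_epi32_overflow_32 guard seen above, and any overflow abandons the whole SIMD pass in favor of HIGH_FDCT32x32_2D_ROWS_C.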
diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c
index 6d9da6aa8..21f11f0c3 100644
--- a/vpx_dsp/x86/fwd_txfm_avx2.c
+++ b/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -13,11 +13,11 @@
#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2
#define FDCT32x32_HIGH_PRECISION 1
-#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
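
The fwd_txfm_avx2.c change only re-aligns the #undef lines, but the idiom they belong to is worth spelling out: the same implementation header is included twice with different macro bindings, so one body yields both the "rd" and the high-precision 32x32 transform. A minimal two-file sketch of that idiom with made-up names (my_impl.h, MY_FN, MY_HIGH_PRECISION); the real parameters are FDCT32x32_2D_AVX2 and FDCT32x32_HIGH_PRECISION.

/* --- my_impl.h: one parameterized body, deliberately no include guard --- */
#if MY_HIGH_PRECISION
static int MY_FN(int x) { return (x + 2) >> 2; } /* extra rounding */
#else
static int MY_FN(int x) { return x >> 2; }
#endif

/* --- my_variants.c: stamp out two functions from the same header -------- */
#include <stdio.h>

#define MY_FN scale_rd
#define MY_HIGH_PRECISION 0
#include "my_impl.h"
#undef MY_FN
#undef MY_HIGH_PRECISION

#define MY_FN scale_hp
#define MY_HIGH_PRECISION 1
#include "my_impl.h" /* second expansion under new names */
#undef MY_FN
#undef MY_HIGH_PRECISION

int main(void) {
  printf("%d %d\n", scale_rd(7), scale_hp(7)); /* prints 1 2 */
  return 0;
}

The #undef lines matter because a stale definition would silently redirect the second expansion; clang-format only changes how much whitespace separates the directive from its argument.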
diff --git a/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h
index 69889e2e9..743e55e63 100644
--- a/vpx_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -43,44 +43,36 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
// These are the coefficients used for the multiplies.
// In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
// where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64,
- cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64,
- cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64,
- -cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64,
- -cospi_24_64, cospi_8_64,
- -cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
// This second rounding constant saves doing some extra adds at the end
- const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
- +(DCT_CONST_ROUNDING << 1));
- const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i in0, in1;
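
The constants reshaped in the hunk above all follow the convention the comment states: cospi_N_64 is cos(N*pi/64) in Q14 fixed point (DCT_CONST_BITS is 14 in libvpx). A quick sketch that prints a few of them; the checked-in tables may round the last digit slightly differently, so treat the printed values as approximate.

#include <math.h>
#include <stdio.h>

#define DCT_CONST_BITS 14 /* Q14, so DCT_CONST_ROUNDING is 1 << 13 */

/* cos(N*pi/64) scaled to Q14, i.e. the cospi_N_64 convention in the
 * comment above ("pN means cos(N pi/64), mN is -cos(N pi/64)"). */
static int cospi_q14(int n) {
  const double pi = 3.14159265358979323846;
  return (int)floor(cos(n * pi / 64.0) * (1 << DCT_CONST_BITS) + 0.5);
}

int main(void) {
  /* The FDCT4x4 vectors interleave these values, e.g. k__cospi_C packs
   * (p8, p24, p8, p24, p24, -p8, p24, -p8). */
  printf("cospi_8_64  ~ %d\n", cospi_q14(8));  /* ~15137 */
  printf("cospi_16_64 ~ %d\n", cospi_q14(16)); /* ~11585 */
  printf("cospi_24_64 ~ %d\n", cospi_q14(24)); /* ~6270  */
  return 0;
}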
@@ -90,14 +82,14 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
#endif
// Load inputs.
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
- (input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
- (input + 3 * stride)));
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+// in0 = [i0 i1 i2 i3 iC iD iE iF]
+// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
#if DCT_HIGH_BIT_DEPTH
// Check inputs small enough to use optimised code
cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
@@ -194,8 +186,8 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
const __m128i t0 = ADD_EPI16(in0, in1);
const __m128i t1 = SUB_EPI16(in0, in1);
- // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
- // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+// t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+// t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
#if DCT_HIGH_BIT_DEPTH
overflow = check_epi16_overflow_x2(&t0, &t1);
if (overflow) {
@@ -263,7 +255,6 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
storeu_output(&in1, output + 2 * 4);
}
-
void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
int pass;
// Constants
@@ -283,14 +274,14 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
int overflow;
#endif
// Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
// Pre-condition input (shift by two)
in0 = _mm_slli_epi16(in0, 2);
in1 = _mm_slli_epi16(in1, 2);
@@ -319,8 +310,8 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
const __m128i q7 = SUB_EPI16(in0, in7);
#if DCT_HIGH_BIT_DEPTH
if (pass == 1) {
- overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
- &q4, &q5, &q6, &q7);
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
if (overflow) {
vpx_highbd_fdct8x8_c(input, output, stride);
return;
@@ -630,22 +621,22 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
__m128i res08, res09, res10, res11, res12, res13, res14, res15;
// Load and pre-condition input.
if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
// x = x << 2
in00 = _mm_slli_epi16(in00, 2);
in01 = _mm_slli_epi16(in01, 2);
@@ -664,22 +655,22 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
in14 = _mm_slli_epi16(in14, 2);
in15 = _mm_slli_epi16(in15, 2);
} else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
// x = (x + 1) >> 2
in00 = _mm_add_epi16(in00, kOne);
in01 = _mm_add_epi16(in01, kOne);
@@ -745,10 +736,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
step1_6 = SUB_EPI16(in01, in14);
step1_7 = SUB_EPI16(in00, in15);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
- &step1_2, &step1_3,
- &step1_4, &step1_5,
- &step1_6, &step1_7);
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -767,8 +757,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
const __m128i q6 = SUB_EPI16(input1, input6);
const __m128i q7 = SUB_EPI16(input0, input7);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
- &q4, &q5, &q6, &q7);
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -818,12 +808,12 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
// into 32 bits.
const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING,
- DCT_CONST_BITS);
- const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING,
- DCT_CONST_BITS);
+ const __m128i r0 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m128i r1 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
overflow = check_epi16_overflow_x2(&r0, &r1);
if (overflow) {
@@ -860,8 +850,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
&k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res02, &res14,
- &res10, &res06);
+ overflow =
+ check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -888,8 +878,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
&k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
- &step2_4);
+ overflow =
+ check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -907,10 +897,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
step3_6 = ADD_EPI16(step1_6, step2_5);
step3_7 = ADD_EPI16(step1_7, step2_4);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
- &step3_2, &step3_3,
- &step3_4, &step3_5,
- &step3_6, &step3_7);
+ overflow =
+ check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
+ &step3_4, &step3_5, &step3_6, &step3_7);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -932,8 +921,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
&k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
- &step2_5);
+ overflow =
+ check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -951,10 +940,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
step1_6 = SUB_EPI16(step3_7, step2_6);
step1_7 = ADD_EPI16(step3_7, step2_6);
#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
- &step1_2, &step1_3,
- &step1_4, &step1_5,
- &step1_6, &step1_7);
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
if (overflow) {
vpx_highbd_fdct16x16_c(input, output, stride);
return;
@@ -1006,16 +994,14 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
}
}
// Transpose the results, do it as two 8x8 transposes.
- transpose_and_output8x8(&res00, &res01, &res02, &res03,
- &res04, &res05, &res06, &res07,
- pass, out0, out1);
- transpose_and_output8x8(&res08, &res09, &res10, &res11,
- &res12, &res13, &res14, &res15,
- pass, out0 + 8, out1 + 8);
+ transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
+ &res06, &res07, pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
+ &res14, &res15, pass, out0 + 8, out1 + 8);
if (pass == 0) {
- out0 += 8*16;
+ out0 += 8 * 16;
} else {
- out1 += 8*16;
+ out1 += 8 * 16;
}
}
// Setup in/out for next pass.
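
One more note on the FDCT4x4_2D hunk earlier in this file: the "second rounding constant" it mentions works because adding 3*DCT_CONST_ROUNDING and shifting by DCT_CONST_BITS + 2 is exactly the usual round-and-shift followed by the final (x + 1) >> 2, folded into one step. A small check of that identity, assuming DCT_CONST_BITS = 14 as in libvpx; all names here are local to the sketch.

#include <stdio.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
#define DCT_CONST_ROUNDING2 (DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1))
#define DCT_CONST_BITS2 (DCT_CONST_BITS + 2)

int main(void) {
  long x;
  /* Two-step rounding: t = (x + R) >> 14, then (t + 1) >> 2.
   * One-step rounding: (x + 3R) >> 16.
   * For arithmetic (floor) right shifts the two agree for every x,
   * which is why the combined constant saves the extra adds. */
  for (x = -2000000; x <= 2000000; ++x) {
    const long two_step =
        (((x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS) + 1) >> 2;
    const long one_step = (x + DCT_CONST_ROUNDING2) >> DCT_CONST_BITS2;
    if (two_step != one_step) {
      printf("mismatch at %ld\n", x);
      return 1;
    }
  }
  printf("identity holds on the tested range\n");
  return 0;
}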
diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
index 3e4f49bd9..e14b99197 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -19,12 +19,12 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
const __m128i zero = _mm_setzero_si128();
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
- (input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
- (input + 3 * stride)));
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
tmp = _mm_add_epi16(in0, in1);
in0 = _mm_unpacklo_epi16(zero, tmp);
@@ -45,19 +45,19 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
}
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
__m128i u0, u1, sum;
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
sum = _mm_add_epi16(u0, u1);
@@ -65,7 +65,7 @@ void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
in2 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, in0);
- u0 = _mm_setzero_si128();
+ u0 = _mm_setzero_si128();
sum = _mm_add_epi16(sum, in2);
in0 = _mm_unpacklo_epi16(u0, sum);
@@ -92,50 +92,50 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
int i;
for (i = 0; i < 2; ++i) {
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
- in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
+ in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
- in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
+ in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
input += 8 * stride;
}
- u0 = _mm_setzero_si128();
+ u0 = _mm_setzero_si128();
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
@@ -161,53 +161,53 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
int i;
for (i = 0; i < 8; ++i) {
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
}
- u0 = _mm_setzero_si128();
+ u0 = _mm_setzero_si128();
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
@@ -230,43 +230,43 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
-#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
-#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
-#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
#endif // CONFIG_VP9_HIGHBITDEPTH
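The block above is libvpx's define/include/undef templating: the implementation headers are included repeatedly, and each inclusion stamps out a differently named and differently configured function. A minimal two-file sketch of the same idiom; the file name shift_impl.inc and the shift_by_* functions are hypothetical, only the structure mirrors the code above.

/* ---- shift_impl.inc (hypothetical template file) --------------------- */
/* Expects SHIFT_FN and SHIFT_AMOUNT to be #defined by the includer, the
 * same way FDCT32x32_2D / FDCT32x32_HIGH_PRECISION are defined above.   */
static int SHIFT_FN(int x) { return x >> SHIFT_AMOUNT; }

/* ---- main.c ----------------------------------------------------------- */
#include <stdio.h>

#define SHIFT_FN shift_by_1
#define SHIFT_AMOUNT 1
#include "shift_impl.inc"
#undef SHIFT_FN
#undef SHIFT_AMOUNT

#define SHIFT_FN shift_by_4
#define SHIFT_AMOUNT 4
#include "shift_impl.inc"
#undef SHIFT_FN
#undef SHIFT_AMOUNT

int main(void) {
  printf("%d %d\n", shift_by_1(32), shift_by_4(32)); /* prints: 16 2 */
  return 0;
}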
diff --git a/vpx_dsp/x86/fwd_txfm_sse2.h b/vpx_dsp/x86/fwd_txfm_sse2.h
index 94d5befbf..5201e764c 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.h
+++ b/vpx_dsp/x86/fwd_txfm_sse2.h
@@ -63,99 +63,57 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
return _mm_movemask_epi8(cmp0);
}
-static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7) {
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
int res0, res1;
res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
return res0 + res1;
}
-static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7,
- const __m128i *preg8,
- const __m128i *preg9,
- const __m128i *preg10,
- const __m128i *preg11) {
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
int res0, res1;
res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
- if (!res0)
- res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
return res0 + res1;
}
-static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7,
- const __m128i *preg8,
- const __m128i *preg9,
- const __m128i *preg10,
- const __m128i *preg11,
- const __m128i *preg12,
- const __m128i *preg13,
- const __m128i *preg14,
- const __m128i *preg15) {
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
int res0, res1;
res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
if (!res0) {
res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
- if (!res1)
- res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
}
return res0 + res1;
}
-static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7,
- const __m128i *preg8,
- const __m128i *preg9,
- const __m128i *preg10,
- const __m128i *preg11,
- const __m128i *preg12,
- const __m128i *preg13,
- const __m128i *preg14,
- const __m128i *preg15,
- const __m128i *preg16,
- const __m128i *preg17,
- const __m128i *preg18,
- const __m128i *preg19,
- const __m128i *preg20,
- const __m128i *preg21,
- const __m128i *preg22,
- const __m128i *preg23,
- const __m128i *preg24,
- const __m128i *preg25,
- const __m128i *preg26,
- const __m128i *preg27,
- const __m128i *preg28,
- const __m128i *preg29,
- const __m128i *preg30,
- const __m128i *preg31) {
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
int res0, res1;
res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
@@ -190,36 +148,31 @@ static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
__m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
__m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
__m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
- __m128i reg0_top_dwords = _mm_shuffle_epi32(
- reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
- __m128i reg1_top_dwords = _mm_shuffle_epi32(
- reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
- __m128i reg2_top_dwords = _mm_shuffle_epi32(
- reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
- __m128i reg3_top_dwords = _mm_shuffle_epi32(
- reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg0_top_dwords =
+ _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg1_top_dwords =
+ _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg2_top_dwords =
+ _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg3_top_dwords =
+ _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
__m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
__m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
__m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
__m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
__m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
- int overflow_01 = _mm_movemask_epi8(
- _mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
- int overflow_23 = _mm_movemask_epi8(
- _mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
+ int overflow_01 =
+ _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
+ int overflow_23 =
+ _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
return (overflow_01 + overflow_23);
}
-static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7,
- const __m128i *zero) {
+static INLINE int k_check_epi32_overflow_8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
@@ -227,91 +180,59 @@ static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
return overflow;
}
-static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7,
- const __m128i *preg8,
- const __m128i *preg9,
- const __m128i *preg10,
- const __m128i *preg11,
- const __m128i *preg12,
- const __m128i *preg13,
- const __m128i *preg14,
- const __m128i *preg15,
- const __m128i *zero) {
+static INLINE int k_check_epi32_overflow_16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *zero) {
int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
- zero);
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
- zero);
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
}
}
}
return overflow;
}
-static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3,
- const __m128i *preg4,
- const __m128i *preg5,
- const __m128i *preg6,
- const __m128i *preg7,
- const __m128i *preg8,
- const __m128i *preg9,
- const __m128i *preg10,
- const __m128i *preg11,
- const __m128i *preg12,
- const __m128i *preg13,
- const __m128i *preg14,
- const __m128i *preg15,
- const __m128i *preg16,
- const __m128i *preg17,
- const __m128i *preg18,
- const __m128i *preg19,
- const __m128i *preg20,
- const __m128i *preg21,
- const __m128i *preg22,
- const __m128i *preg23,
- const __m128i *preg24,
- const __m128i *preg25,
- const __m128i *preg26,
- const __m128i *preg27,
- const __m128i *preg28,
- const __m128i *preg29,
- const __m128i *preg30,
- const __m128i *preg31,
- const __m128i *zero) {
+static INLINE int k_check_epi32_overflow_32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
- zero);
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
- zero);
+ overflow =
+ k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg20, preg21,
- preg22, preg23, zero);
+ overflow =
+ k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg24, preg25,
- preg26, preg27, zero);
+ overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
+ preg27, zero);
if (!overflow) {
- overflow = k_check_epi32_overflow_4(preg28, preg29,
- preg30, preg31, zero);
+ overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
+ preg31, zero);
}
}
}
@@ -322,7 +243,7 @@ static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
return overflow;
}
-static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
@@ -335,7 +256,7 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
@@ -348,9 +269,7 @@ static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-
-static INLINE __m128i mult_round_shift(const __m128i *pin0,
- const __m128i *pin1,
+static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
const __m128i *pmultiplier,
const __m128i *prounding,
const int shift) {
@@ -364,12 +283,10 @@ static INLINE __m128i mult_round_shift(const __m128i *pin0,
}
static INLINE void transpose_and_output8x8(
- const __m128i *pin00, const __m128i *pin01,
- const __m128i *pin02, const __m128i *pin03,
- const __m128i *pin04, const __m128i *pin05,
- const __m128i *pin06, const __m128i *pin07,
- const int pass, int16_t* out0_ptr,
- tran_low_t* out1_ptr) {
+ const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
+ const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
+ const __m128i *pin06, const __m128i *pin07, const int pass,
+ int16_t *out0_ptr, tran_low_t *out1_ptr) {
// 00 01 02 03 04 05 06 07
// 10 11 12 13 14 15 16 17
// 20 21 22 23 24 25 26 27
@@ -427,14 +344,14 @@ static INLINE void transpose_and_output8x8(
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
if (pass == 0) {
- _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
} else {
storeu_output(&tr2_0, (out1_ptr + 0 * 16));
storeu_output(&tr2_1, (out1_ptr + 1 * 16));
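As a plain-C reference for the pattern in check_epi16_overflow_x8/x12/x16 above: the wider checks reuse a 4-register leaf check and skip further work once one group has already reported overflow. The leaf below is a hypothetical scalar stand-in (the SSE2 x4 leaf is only partially visible in this hunk); only the composition mirrors the header.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical scalar stand-in for the x4 leaf check: nonzero if any of the
 * 32 values in four 8-lane "registers" no longer fits in an int16. */
static int check16_x4(const int32_t *v) {
  int i;
  for (i = 0; i < 4 * 8; ++i)
    if (v[i] > INT16_MAX || v[i] < INT16_MIN) return 1;
  return 0;
}

/* Same composition as check_epi16_overflow_x12 above: two unconditional x4
 * checks, plus a third evaluated only when the first group was clean. */
static int check16_x12(const int32_t *v) {
  int res0 = check16_x4(v);             /* regs 0-3  */
  int res1 = check16_x4(v + 32);        /* regs 4-7  */
  if (!res0) res0 = check16_x4(v + 64); /* regs 8-11 */
  return res0 + res1;
}

int main(void) {
  int32_t regs[12 * 8] = { 0 };
  regs[70] = 40000; /* force an overflow in the third group of registers */
  printf("overflow: %d\n", check16_x12(regs));
  return 0;
}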
diff --git a/vpx_dsp/x86/halfpix_variance_sse2.c b/vpx_dsp/x86/halfpix_variance_sse2.c
index 4a8fb6df7..b5c3f5fa2 100644
--- a/vpx_dsp/x86/halfpix_variance_sse2.c
+++ b/vpx_dsp/x86/halfpix_variance_sse2.c
@@ -17,10 +17,8 @@
void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
int ref_stride,
const unsigned char *src,
- int src_stride,
- unsigned int height,
- int *sum,
- unsigned int *sumsquared);
+ int src_stride, unsigned int height,
+ int *sum, unsigned int *sumsquared);
void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
const unsigned char *src, int src_stride,
unsigned int height, int *sum,
@@ -33,8 +31,7 @@ void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
int src_stride,
const unsigned char *dst,
- int dst_stride,
- uint32_t *sse) {
+ int dst_stride, uint32_t *sse) {
int xsum0;
unsigned int xxsum0;
@@ -50,12 +47,11 @@ uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
int src_stride,
const unsigned char *dst,
- int dst_stride,
- uint32_t *sse) {
+ int dst_stride, uint32_t *sse) {
int xsum0;
unsigned int xxsum0;
- vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
+ vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
+ &xxsum0);
*sse = xxsum0;
assert(xsum0 <= 255 * 16 * 16);
@@ -63,12 +59,10 @@ uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
}
-
uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
int src_stride,
const unsigned char *dst,
- int dst_stride,
- uint32_t *sse) {
+ int dst_stride, uint32_t *sse) {
int xsum0;
unsigned int xxsum0;
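The return statements above are the standard block-variance identity, variance = SSE - sum^2 / N, with N = 16 * 16 = 256 giving the >> 8. A scalar sketch of just that identity (the half-pixel interpolation performed by the vpx_half_*_variance16x_h_sse2 helpers is deliberately left out):

#include <stdint.h>
#include <stdio.h>

/* Scalar reference for the 16x16 variance identity used above:
 * variance = sum of squared differences - (sum of differences)^2 / 256. */
static uint32_t variance16x16_ref(const uint8_t *src, int src_stride,
                                  const uint8_t *dst, int dst_stride) {
  int64_t sum = 0;
  uint32_t sse = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src[r * src_stride + c] - dst[r * dst_stride + c];
      sum += d;
      sse += (uint32_t)(d * d);
    }
  }
  return sse - (uint32_t)((sum * sum) >> 8);
}

int main(void) {
  uint8_t a[16 * 16], b[16 * 16];
  int i;
  for (i = 0; i < 16 * 16; ++i) {
    a[i] = (uint8_t)i;
    b[i] = (uint8_t)(i ^ 2); /* differences of +/-2 that cancel in the sum */
  }
  printf("variance: %u\n", variance16x16_ref(a, 16, b, 16)); /* 1024 */
  return 0;
}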
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index 72e42adc9..7d6641108 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -25,16 +25,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
if (bd == 8) {
t80 = _mm_set1_epi16(0x80);
- max = _mm_subs_epi16(
- _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
} else if (bd == 10) {
t80 = _mm_set1_epi16(0x200);
- max = _mm_subs_epi16(
- _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
} else { // bd == 12
t80 = _mm_set1_epi16(0x800);
- max = _mm_subs_epi16(
- _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
}
min = _mm_subs_epi16(zero, t80);
@@ -81,16 +78,16 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
blimit = _mm_slli_epi16(
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
} else { // bd == 12
blimit = _mm_slli_epi16(
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
}
q4 = _mm_load_si128((__m128i *)(s + 4 * p));
@@ -118,25 +115,22 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
- abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
- _mm_subs_epu16(p0, p1)),
- _mm_or_si128(_mm_subs_epu16(q1, q0),
- _mm_subs_epu16(q0, q1)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
mask = _mm_max_epi16(work, mask);
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
- _mm_subs_epu16(p1, p2)),
- _mm_or_si128(_mm_subs_epu16(q2, q1),
- _mm_subs_epu16(q1, q2)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
mask = _mm_max_epi16(work, mask);
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
- _mm_subs_epu16(p2, p3)),
- _mm_or_si128(_mm_subs_epu16(q3, q2),
- _mm_subs_epu16(q2, q3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
@@ -160,8 +154,8 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
ps0 = _mm_subs_epi16(p0, t80);
qs0 = _mm_subs_epi16(q0, t80);
- filt = _mm_and_si128(
- signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+ filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
+ hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
@@ -175,33 +169,27 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
filter2 = _mm_srai_epi16(filter2, 0x3);
qs0 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
- t80);
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
ps0 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
- t80);
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
- qs1 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
- t80);
- ps1 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
- t80);
+ qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
// end highbd_filter4
// loopfilter done
// highbd_flat_mask4
- flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
- _mm_subs_epu16(p0, p2)),
- _mm_or_si128(_mm_subs_epu16(p3, p0),
- _mm_subs_epu16(p0, p3)));
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
- _mm_subs_epu16(q0, q2)),
- _mm_or_si128(_mm_subs_epu16(q3, q0),
- _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
flat = _mm_max_epi16(work, flat);
work = _mm_max_epi16(abs_p1p0, abs_q1q0);
flat = _mm_max_epi16(work, flat);
@@ -229,27 +217,23 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
// highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
// but referred to as p0-p4 & q0-q4 in fn)
- flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
- _mm_subs_epu16(p0, p4)),
- _mm_or_si128(_mm_subs_epu16(q4, q0),
- _mm_subs_epu16(q0, q4)));
-
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
- _mm_subs_epu16(p0, p5)),
- _mm_or_si128(_mm_subs_epu16(q5, q0),
- _mm_subs_epu16(q0, q5)));
+ flat2 = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
+ _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
+ _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
flat2 = _mm_max_epi16(work, flat2);
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
- _mm_subs_epu16(p0, p6)),
- _mm_or_si128(_mm_subs_epu16(q6, q0),
- _mm_subs_epu16(q0, q6)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
+ _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
flat2 = _mm_max_epi16(work, flat2);
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
- _mm_subs_epu16(p0, p7)),
- _mm_or_si128(_mm_subs_epu16(q7, q0),
- _mm_subs_epu16(q0, q7)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
+ _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
flat2 = _mm_max_epi16(work, flat2);
if (bd == 8)
@@ -268,29 +252,26 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
eight = _mm_set1_epi16(8);
four = _mm_set1_epi16(4);
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
- _mm_add_epi16(p4, p3));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
- _mm_add_epi16(q4, q3));
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
- pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(four,
- _mm_add_epi16(pixetFilter_p2p1p0,
- pixetFilter_q2q1q0));
- flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(p7, p0)), 4);
- flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(q7, q0)), 4);
- flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(p3, p0)), 3);
- flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(q3, q0)), 3);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ flat2_p0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
+ flat2_q0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
+ flat_p0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
+ flat_q0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
sum_p7 = _mm_add_epi16(p7, p7);
sum_q7 = _mm_add_epi16(q7, q7);
@@ -306,10 +287,10 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
- flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p1)), 3);
- flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q1)), 3);
+ flat_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
+ flat_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);
sum_p7 = _mm_add_epi16(sum_p7, p7);
sum_q7 = _mm_add_epi16(sum_q7, q7);
@@ -318,53 +299,53 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
- flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p2)), 4);
- flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q2)), 4);
+ flat2_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
+ flat2_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);
pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
- flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p2)), 3);
- flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q2)), 3);
+ flat_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
+ flat_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);
sum_p7 = _mm_add_epi16(sum_p7, p7);
sum_q7 = _mm_add_epi16(sum_q7, q7);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
- flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p3)), 4);
- flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q3)), 4);
+ flat2_p3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
+ flat2_q3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);
sum_p7 = _mm_add_epi16(sum_p7, p7);
sum_q7 = _mm_add_epi16(sum_q7, q7);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
- flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p4)), 4);
- flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q4)), 4);
+ flat2_p4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
+ flat2_q4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);
sum_p7 = _mm_add_epi16(sum_p7, p7);
sum_q7 = _mm_add_epi16(sum_q7, q7);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
- flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p5)), 4);
- flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q5)), 4);
+ flat2_p5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
+ flat2_q5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);
sum_p7 = _mm_add_epi16(sum_p7, p7);
sum_q7 = _mm_add_epi16(sum_q7, q7);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
- flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p6)), 4);
- flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q6)), 4);
+ flat2_p6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
+ flat2_q6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -505,8 +486,7 @@ void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
const uint8_t *_blimit,
const uint8_t *_limit,
- const uint8_t *_thresh,
- int bd) {
+ const uint8_t *_thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
@@ -546,19 +526,19 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
t80 = _mm_set1_epi16(0x80);
} else if (bd == 10) {
blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
t80 = _mm_set1_epi16(0x200);
} else { // bd == 12
blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
t80 = _mm_set1_epi16(0x800);
}
@@ -568,20 +548,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
qs1 = _mm_subs_epi16(q1, t80);
// filter_mask and hev_mask
- abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
- _mm_subs_epu16(p0, p1));
- abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
- _mm_subs_epu16(q0, q1));
-
- abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
- _mm_subs_epu16(q0, p0));
- abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
- _mm_subs_epu16(q1, p1));
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
- abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
@@ -593,28 +569,24 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
mask = _mm_max_epi16(abs_q1q0, mask);
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
- _mm_subs_epu16(p1, p2)),
- _mm_or_si128(_mm_subs_epu16(q2, q1),
- _mm_subs_epu16(q1, q2)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
mask = _mm_max_epi16(work, mask);
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
- _mm_subs_epu16(p2, p3)),
- _mm_or_si128(_mm_subs_epu16(q3, q2),
- _mm_subs_epu16(q2, q3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
// flat_mask4
- flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
- _mm_subs_epu16(p0, p2)),
- _mm_or_si128(_mm_subs_epu16(q2, q0),
- _mm_subs_epu16(q0, q2)));
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
- _mm_subs_epu16(p0, p3)),
- _mm_or_si128(_mm_subs_epu16(q3, q0),
- _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
flat = _mm_max_epi16(work, flat);
flat = _mm_max_epi16(abs_p1p0, flat);
flat = _mm_max_epi16(abs_q1q0, flat);
@@ -737,14 +709,10 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
_mm_store_si128((__m128i *)(s + 2 * p), q2);
}
-void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
- const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1,
- int bd) {
+void vpx_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}
@@ -752,8 +720,7 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
const uint8_t *_blimit,
const uint8_t *_limit,
- const uint8_t *_thresh,
- int bd) {
+ const uint8_t *_thresh, int bd) {
const __m128i zero = _mm_set1_epi16(0);
__m128i blimit, limit, thresh;
__m128i mask, hev, flat;
@@ -765,16 +732,16 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
__m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
__m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
__m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
- _mm_subs_epu16(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
- _mm_subs_epu16(q0, q1));
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
const __m128i one = _mm_set1_epi16(1);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
- _mm_subs_epu16(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
- _mm_subs_epu16(q1, p1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
__m128i work;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
@@ -838,7 +805,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
- abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
@@ -848,15 +815,13 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
mask = _mm_max_epi16(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
- _mm_subs_epu16(p1, p2)),
- _mm_or_si128(_mm_subs_epu16(p3, p2),
- _mm_subs_epu16(p2, p3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
mask = _mm_max_epi16(work, mask);
- work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
- _mm_subs_epu16(q1, q2)),
- _mm_or_si128(_mm_subs_epu16(q3, q2),
- _mm_subs_epu16(q2, q3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
@@ -878,8 +843,8 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
// Filter1 >> 3
work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
- filter1 = _mm_and_si128(filter1, t1f); // clamp the range
+ work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
+ filter1 = _mm_and_si128(filter1, t1f); // clamp the range
filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
// Filter2 >> 3
@@ -901,12 +866,12 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
q0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
- q1 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+ q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
p0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
- p1 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+ p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
@@ -914,35 +879,38 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}
-void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
- const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1,
- int bd) {
+void vpx_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}
-static INLINE void highbd_transpose(uint16_t *src[], int in_p,
- uint16_t *dst[], int out_p,
- int num_8x8_to_transpose) {
+static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
+ int out_p, int num_8x8_to_transpose) {
int idx8x8 = 0;
__m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
do {
uint16_t *in = src[idx8x8];
uint16_t *out = dst[idx8x8];
- p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
- p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
- p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
- p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
- p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
- p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
- p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
- p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
+ p0 =
+ _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ p1 =
+ _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ p2 =
+ _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ p3 =
+ _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ p4 =
+ _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ p5 =
+ _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ p6 =
+ _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ p7 =
+ _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
// 00 10 01 11 02 12 03 13
x0 = _mm_unpacklo_epi16(p0, p1);
// 20 30 21 31 22 32 23 33
@@ -960,9 +928,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p,
// 01 11 21 31 41 51 61 71
x7 = _mm_unpackhi_epi64(x4, x5);
- _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+ _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
// 00 10 20 30 40 50 60 70
- _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+ _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
// 01 11 21 31 41 51 61 71
// 02 12 22 32 03 13 23 33
@@ -974,9 +942,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p,
// 03 13 23 33 43 53 63 73
x7 = _mm_unpackhi_epi64(x4, x5);
- _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
// 02 12 22 32 42 52 62 72
- _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
// 03 13 23 33 43 53 63 73
// 04 14 05 15 06 16 07 17
@@ -996,9 +964,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p,
// 05 15 25 35 45 55 65 75
x7 = _mm_unpackhi_epi64(x4, x5);
- _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
// 04 14 24 34 44 54 64 74
- _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
// 05 15 25 35 45 55 65 75
// 06 16 26 36 07 17 27 37
@@ -1010,15 +978,15 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p,
// 07 17 27 37 47 57 67 77
x7 = _mm_unpackhi_epi64(x4, x5);
- _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
// 06 16 26 36 46 56 66 76
- _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
// 07 17 27 37 47 57 67 77
} while (++idx8x8 < num_8x8_to_transpose);
}
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
- int in_p, uint16_t *out, int out_p) {
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
+ uint16_t *out, int out_p) {
uint16_t *src0[1];
uint16_t *src1[1];
uint16_t *dest0[1];
@@ -1031,10 +999,8 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
highbd_transpose(src1, in_p, dest1, out_p, 1);
}
-void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
@@ -1056,14 +1022,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
highbd_transpose(src, 8, dst, p, 1);
}
-void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
+void vpx_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1083,10 +1045,8 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
highbd_transpose(src, 16, dst, p, 2);
}
-void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
@@ -1108,14 +1068,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
highbd_transpose(src, 8, dst, p, 1);
}
-void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
+void vpx_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1136,11 +1092,9 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
highbd_transpose(src, 16, dst, p, 2);
}
-void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
- const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int bd) {
+ const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1154,8 +1108,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
highbd_transpose(src, p, dst, 8, 2);
// Loop filtering
- vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit,
- thresh, bd);
+ vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
+ bd);
src[0] = t_dst;
src[1] = t_dst + 8 * 8;
dst[0] = s - 8;
@@ -1165,12 +1119,10 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
highbd_transpose(src, 8, dst, p, 2);
}
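vpx_highbd_lpf_vertical_16_sse2 above follows the usual trick for vertical edges: gather the neighborhood into a small contiguous buffer with highbd_transpose, run the horizontal-edge filter on it, then transpose the result back into place. A scalar reference for what one 8x8 transpose step computes (strides in uint16_t units, as in the SSE2 helper):

#include <stdint.h>
#include <stdio.h>

/* Scalar reference for one 8x8 block handled by highbd_transpose():
 * out[c * out_p + r] = in[r * in_p + c]. */
static void transpose8x8_u16_ref(const uint16_t *in, int in_p, uint16_t *out,
                                 int out_p) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) out[c * out_p + r] = in[r * in_p + c];
}

int main(void) {
  uint16_t block[8 * 8], t[8 * 8];
  int i;
  for (i = 0; i < 64; ++i) block[i] = (uint16_t)i;
  transpose8x8_u16_ref(block, 8, t, 8);
  printf("t[1*8+0]=%u (was block[0*8+1]=%u)\n", (unsigned)t[8],
         (unsigned)block[1]); /* 1 and 1 */
  return 0;
}

In the dual and vertical paths above, the same helper is simply invoked with num_8x8_to_transpose = 2 and the appropriate source/destination strides.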
-void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
- int p,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh,
- int bd) {
+ const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
// Transpose 16x16
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index fd46bef3d..64b56223e 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -15,26 +15,19 @@
#include "vpx_ports/mem.h"
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
- intptr_t count,
- int skip_block,
- const int16_t *zbin_ptr,
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr,
- const int16_t *scan,
- const int16_t *iscan) {
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
__m128i zbins[2];
__m128i nzbins[2];
- zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
- (int)zbin_ptr[1],
- (int)zbin_ptr[1],
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
(int)zbin_ptr[0]);
zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
@@ -73,14 +66,13 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
coeffs_sign = _mm_srai_epi32(coeffs, 31);
- coeffs = _mm_sub_epi32(
- _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
tmp1 = _mm_or_si128(tmp1, tmp2);
test = _mm_movemask_epi8(tmp1);
- _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
- _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
for (j = 0; j < 4; j++) {
if (test & (1 << (4 * j))) {
@@ -91,8 +83,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
(uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
- if (abs_qcoeff)
- eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
@@ -100,20 +91,12 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
*eob_ptr = eob_i + 1;
}
-
-void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs,
- int skip_block,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr,
- const int16_t *scan,
- const int16_t *iscan) {
+void vpx_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
__m128i zbins[2];
__m128i nzbins[2];
int idx = 0;
@@ -122,10 +105,7 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
- zbins[0] = _mm_set_epi32(zbin1_tmp,
- zbin1_tmp,
- zbin1_tmp,
- zbin0_tmp);
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
nzbins[0] = _mm_setzero_si128();
@@ -146,14 +126,10 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
cmp1 = _mm_and_si128(cmp1, cmp2);
test = _mm_movemask_epi8(cmp1);
- if (!(test & 0xf))
- idx_arr[idx++] = i * 4;
- if (!(test & 0xf0))
- idx_arr[idx++] = i * 4 + 1;
- if (!(test & 0xf00))
- idx_arr[idx++] = i * 4 + 2;
- if (!(test & 0xf000))
- idx_arr[idx++] = i * 4 + 3;
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
}
// Quantization pass: only process the coefficients selected in
@@ -163,15 +139,14 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 = abs_coeff
- + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff)
- eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
}
*eob_ptr = eob + 1;
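The inner loop above is easier to follow one coefficient at a time. Below is a scalar walk-through of the same 32x32 quantization step, using only the arithmetic visible in the hunk (halved round value, the >> 16 and >> 15 multiplies, dequant divided by 2); ROUND_POWER_OF_TWO is re-declared here to mirror the libvpx macro, and the constants in main are illustrative only, not real VP9 quantizer values.

#include <stdint.h>
#include <stdio.h>

/* Mirrors libvpx's ROUND_POWER_OF_TWO(value, n). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Scalar walk-through of the per-coefficient 32x32 quantization step in the
 * loop above: round is halved relative to the smaller transforms, and the
 * dequantized value is halved again at the end. */
static void quantize_32x32_coeff(int coeff, int16_t round, int16_t quant,
                                 int16_t quant_shift, int16_t dequant,
                                 int *qcoeff, int *dqcoeff) {
  const int coeff_sign = coeff >> 31; /* 0 or -1 */
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round, 1);
  const int64_t tmp2 = ((tmp1 * quant) >> 16) + tmp1;
  const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift) >> 15);
  *qcoeff = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
  *dqcoeff = *qcoeff * dequant / 2;
}

int main(void) {
  int q, dq;
  /* Illustrative inputs, not taken from any real quantizer table. */
  quantize_32x32_coeff(-1234, 48, 20000, 22000, 40, &q, &dq);
  printf("qcoeff=%d dqcoeff=%d\n", q, dq);
  return 0;
}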
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 0d62adf8b..414ae5de1 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -11,9 +11,9 @@
#include "vpx_ports/mem.h"
-typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
@@ -24,8 +24,8 @@ uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
uint32_t *sse, int *sum);
static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
@@ -36,8 +36,8 @@ static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
@@ -45,8 +45,8 @@ static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
@@ -56,8 +56,8 @@ static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
@@ -67,8 +67,8 @@ static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
@@ -78,8 +78,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
@@ -88,84 +88,83 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
}
-
-#define HIGH_GET_VAR(S) \
-void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
- sse, sum); \
-} \
-\
-void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
- sse, sum); \
- *sum = ROUND_POWER_OF_TWO(*sum, 2); \
- *sse = ROUND_POWER_OF_TWO(*sse, 4); \
-} \
-\
-void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
- sse, sum); \
- *sum = ROUND_POWER_OF_TWO(*sum, 4); \
- *sse = ROUND_POWER_OF_TWO(*sse, 8); \
-}
+#define HIGH_GET_VAR(S) \
+ void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ }
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
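
The 10- and 12-bit wrappers generated above rescale the raw accumulators so variance is reported on the 8-bit scale: sum (linear in pixel values) is reduced by 2 or 4 bits and sse (quadratic) by 4 or 8 bits. ROUND_POWER_OF_TWO in vpx_dsp rounds to nearest while shifting; a scalar sketch of that behaviour, assuming the usual definition of the macro:

static unsigned int round_power_of_two(unsigned int value, int n) {
  /* Round-to-nearest right shift, matching how the generated wrappers bring
   * sum and sse down to the 8-bit range (2/4 bits for 10-bit input,
   * 4/8 bits for 12-bit input). */
  return (value + (1u << (n - 1))) >> n;
}
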
-#define VAR_FN(w, h, block_size, shift) \
-uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
- int sum; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- vpx_highbd_calc##block_size##x##block_size##var_sse2, \
- block_size); \
- return *sse - (((int64_t)sum * sum) >> shift); \
-} \
-\
-uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
- int sum; \
- int64_t var; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_10_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
- return (var >= 0) ? (uint32_t)var : 0; \
-} \
-\
-uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
- int sum; \
- int64_t var; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_12_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
- return (var >= 0) ? (uint32_t)var : 0; \
-}
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
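
Each function generated by VAR_FN reduces to the unnormalized variance identity: the accumulated SSE minus the squared sum shifted by log2(w * h), clamped at zero because the 10/12-bit rounding above can push the difference slightly negative. A scalar sketch of that arithmetic (illustrative, not libvpx API):

#include <stdint.h>

static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  /* shift == log2(w * h); e.g. VAR_FN(64, 64, 16, 12) passes 12 == 6 + 6. */
  const int64_t var = (int64_t)sse - (((int64_t)sum * sum) >> shift);
  return (var >= 0) ? (uint32_t)var : 0;
}
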
@@ -181,13 +180,13 @@ VAR_FN(8, 8, 8, 6);
#undef VAR_FN
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
@@ -197,8 +196,8 @@ unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
@@ -208,19 +207,19 @@ unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
@@ -230,8 +229,8 @@ unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
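
The mse16x16/mse8x8 wrappers in this range reuse the same accumulation helpers but return the raw SSE and discard the sum, since no mean term is subtracted for MSE. For reference, a plain scalar version of the quantity they compute (a sketch, not the libvpx implementation):

#include <stdint.h>

static uint64_t highbd_block_sse(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride, int w,
                                 int h) {
  uint64_t sse = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
      sse += (uint64_t)((int64_t)diff * diff);
    }
  }
  return sse;
}
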
@@ -241,25 +240,21 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in
// highbd_subpel_variance_impl_sse2.asm
-#define DECL(w, opt) \
- int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- int height, \
- unsigned int *sse, \
- void *unused0, void *unused);
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
- DECL(8, opt); \
+ DECL(8, opt); \
DECL(16, opt)
DECLS(sse2);
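
DECL only forward-declares the assembly kernels from highbd_subpel_variance_impl_sse2.asm for the two column widths the file provides (8 and 16); the trailing void pointers are the PIC placeholders mentioned above. For instance, DECL(16, sse2) expands to roughly the following prototype, where the return value is the signed sum of differences for one 16-wide column and *sse receives that column's SSE:

#include <stddef.h>
#include <stdint.h>

int vpx_highbd_sub_pixel_variance16xh_sse2(
    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,
    const uint16_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse,
    void *unused0, void *unused);
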
@@ -267,152 +262,134 @@ DECLS(sse2);
#undef DECLS
#undef DECL
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst8, \
- int dst_stride, \
- uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, h, \
- &sse, NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- src_stride, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- h, &sse, NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- src_stride, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- for (start_row = 0; start_row < h; start_row +=16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + (start_row * dst_stride), \
- dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
- dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
- dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
- dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- }\
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt) \
-FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt, (int64_t)); \
-FN(8, 16, 8, 3, 4, opt, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt, (int64_t));
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+ }
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t));
FNS(sse2);
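
The FN wrappers above cover blocks wider than the 16-wide assembly kernel by re-invoking it at column offsets 16, 32 and 48 and summing the partial results; the 12-bit bodies additionally walk the rows in chunks of 16 and keep SSE in a 64-bit accumulator so the kernel's 32-bit output cannot overflow at that bit depth. A stripped-down sketch of the column splitting (illustrative names, not libvpx API):

#include <stdint.h>

typedef int (*col16_kernel_fn)(const uint16_t *src, const uint16_t *dst,
                               int height, uint32_t *sse);

static int sum_wide_block(const uint16_t *src, const uint16_t *dst, int w,
                          int h, col16_kernel_fn kernel, uint64_t *sse_out) {
  int se = 0;
  uint64_t sse = 0;
  int col;
  for (col = 0; col < w; col += 16) { /* 16 == width handled by the kernel */
    uint32_t sse_col;
    se += kernel(src + col, dst + col, h, &sse_col);
    sse += sse_col;
  }
  *sse_out = sse;
  return se;
}
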
@@ -420,173 +397,154 @@ FNS(sse2);
#undef FN
// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- const uint16_t *sec, \
- ptrdiff_t sec_stride, \
- int height, \
- unsigned int *sse, \
- void *unused0, void *unused);
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
#define DECLS(opt1) \
-DECL(16, opt1) \
-DECL(8, opt1)
+ DECL(16, opt1) \
+ DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, \
- dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, \
- dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- sec, w, h, &sse, NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- for (start_row = 0; start_row < h; start_row +=16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + (start_row * dst_stride), dst_stride, \
- sec + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 16 + (start_row * dst_stride), dst_stride, \
- sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 32 + (start_row * dst_stride), dst_stride, \
- sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 48 + (start_row * dst_stride), dst_stride, \
- sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- } \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+ }
-#define FNS(opt1) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
+#define FNS(opt1) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+ FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
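
The *_avg_ wrappers follow the same splitting pattern as the plain sub-pixel variances, except that the filtered block is first averaged with a second predictor sec (indexed with stride w in the calls above) before the difference against the other block is taken, as used for compound prediction. A scalar sketch of that comparison, assuming for simplicity that both prediction buffers use stride w (illustrative only):

#include <stdint.h>

static void highbd_avg_pred_sums(const uint16_t *pred, const uint16_t *sec,
                                 const uint16_t *other, int other_stride,
                                 int w, int h, int64_t *sum, uint64_t *sse) {
  int r, c;
  *sum = 0;
  *sse = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      /* Average the sub-pel prediction with the second predictor, rounding
       * to nearest, then accumulate the difference against the other block. */
      const int avg = (pred[r * w + c] + sec[r * w + c] + 1) >> 1;
      const int diff = other[r * other_stride + c] - avg;
      *sum += diff;
      *sse += (uint64_t)((int64_t)diff * diff);
    }
  }
}
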
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index df5068c62..a6fc1161f 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -12,14 +12,14 @@
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-#define RECON_AND_STORE4X4(dest, in_x) \
-{ \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)(dest) = _mm_cvtsi128_si32(d0); \
-}
+#define RECON_AND_STORE4X4(dest, in_x) \
+ { \
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+ }
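
Scalar equivalent of the reformatted RECON_AND_STORE4X4 macro: four destination bytes are widened to 16 bits, the residual row is added, and the sums are written back clamped to [0, 255], which is what the unpacklo/add/packus sequence does in one shot (sketch, not libvpx API).

#include <stdint.h>

static uint8_t clip_pixel_u8(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

static void recon_and_store4_scalar(uint8_t *dest, const int16_t *residual) {
  int i;
  for (i = 0; i < 4; ++i) dest[i] = clip_pixel_u8(dest[i] + residual[i]);
}
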
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
@@ -263,192 +263,189 @@ void iadst4_sse2(__m128i *in) {
in[1] = _mm_packs_epi32(u[2], u[3]);
}
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
}
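
On plain arrays, TRANSPOSE_8X8 above is simply an 8x8 transpose of 16-bit coefficients; the SSE2 version reaches the same layout in three rounds of unpacklo/unpackhi at 16-, 32- and 64-bit granularity rather than an element loop. For reference (sketch only):

#include <stdint.h>

static void transpose_8x8_scalar(const int16_t in[8][8], int16_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) out[c][r] = in[r][c];
  }
}
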
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
- out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
+ { \
+ const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+ const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
}
// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
- cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
+ res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
}
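
Each 32-bit lane produced inside MULTIPLICATION_AND_ADD is one fixed-point butterfly term: the inputs were interleaved as (a, b) pairs and the constants packed as (c0, c1) pairs, so _mm_madd_epi16 yields a*c0 + b*c1, which is then rounded, shifted by DCT_CONST_BITS and saturated back to 16 bits by _mm_packs_epi32. A scalar model of one lane, assuming DCT_CONST_BITS is 14 as in txfm_common.h (sketch only):

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14 /* mirrors DCT_CONST_BITS */

static int16_t butterfly_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  const int64_t t = (int64_t)a * c0 + (int64_t)b * c1;
  const int64_t rounded =
      (t + (1 << (SKETCH_DCT_CONST_BITS - 1))) >> SKETCH_DCT_CONST_BITS;
  /* _mm_packs_epi32 saturates to the int16_t range. */
  if (rounded > INT16_MAX) return INT16_MAX;
  if (rounded < INT16_MIN) return INT16_MIN;
  return (int16_t)rounded;
}
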
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
}
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) \
- { \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_4, \
- stp1_7, stp1_5, stp1_6) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_0, \
- stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
+ out4, out5, out6, out7) \
+ { \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
+ stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+ const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
+ stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+ tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ } \
+ \
+ /* Stage4 */ \
+ out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+ out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+ out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+ out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+ out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+ out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+ out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+ out7 = _mm_subs_epi16(stp1_0, stp2_7); \
}
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -484,12 +481,12 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
// 2-D
for (i = 0; i < 2; i++) {
// 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
+ in6, in7);
}
// Final rounding and shift
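
The 2-D loop above is the usual separable-transform pattern: transpose, run the 1-D 8-point IDCT on what are now rows, and repeat, so the second pass effectively works on the original columns. Structurally it amounts to the following, with the row transform passed in as a callback (illustrative sketch, not the libvpx flow verbatim):

#include <stdint.h>
#include <string.h>

typedef void (*row_idct8_fn)(int16_t row[8]);

static void idct8x8_2d_sketch(int16_t block[8][8], row_idct8_fn idct8_row) {
  int pass, r, c;
  for (pass = 0; pass < 2; ++pass) {
    int16_t t[8][8];
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) t[c][r] = block[r][c]; /* transpose */
    }
    for (r = 0; r < 8; ++r) idct8_row(t[r]); /* 1-D pass on the rows */
    memcpy(block, t, sizeof(t));
  }
}
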
@@ -560,12 +557,12 @@ void idct8_sse2(__m128i *in) {
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
- in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
+ in1, in2, in3, in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
+ in[4], in[5], in[6], in[7]);
}
void iadst8_sse2(__m128i *in) {
@@ -906,8 +903,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
- IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
+ in5, in6, in7);
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
in1 = _mm_adds_epi16(in1, final_rounding);
@@ -937,242 +934,234 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
RECON_AND_STORE(dest + 7 * stride, in7);
}
-#define IDCT16 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
- stg2_0, stg2_1, stg2_2, stg2_3, \
- stp2_8, stp2_15, stp2_9, stp2_14) \
- \
- MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
- stg2_4, stg2_5, stg2_6, stg2_7, \
- stp2_10, stp2_13, stp2_11, stp2_12) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
- stg3_0, stg3_1, stg3_2, stg3_3, \
- stp1_4, stp1_7, stp1_5, stp1_6) \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
- stg4_0, stg4_1, stg4_2, stg4_3, \
- stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
- stg4_4, stg4_5, stg4_6, stg4_7, \
- stp2_9, stp2_14, stp2_10, stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, \
- stp2_10, stp2_13, stp2_11, stp2_12) \
+#define IDCT16 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
+ stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
+ \
+ MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
+ stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
+ stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
+ \
+ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ \
+ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
+ stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
}
-#define IDCT16_10 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
- stg2_0, stg2_1, stg2_6, stg2_7, \
- stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
- stg3_0, stg3_1, \
- stp2_4, stp2_7) \
- \
- stp1_9 = stp1_8_0; \
- stp1_10 = stp1_11; \
- \
- stp1_13 = stp1_12_0; \
- stp1_14 = stp1_15; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
- stg4_0, stg4_1, \
- stp1_0, stp1_1) \
- stp2_5 = stp2_4; \
- stp2_6 = stp2_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
- stg4_4, stg4_5, stg4_6, stg4_7, \
- stp2_9, stp2_14, stp2_10, stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_2 = stp1_1; \
- stp1_3 = stp1_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, \
- stp2_10, stp2_13, stp2_11, stp2_12) \
- }
+#define IDCT16_10 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
+ stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
+ stp1_12_0) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
+ \
+ stp1_9 = stp1_8_0; \
+ stp1_10 = stp1_11; \
+ \
+ stp1_13 = stp1_12_0; \
+ stp1_14 = stp1_15; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
+ stp2_5 = stp2_4; \
+ stp2_6 = stp2_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_2 = stp1_1; \
+ stp1_3 = stp1_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ }
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
@@ -1207,10 +1196,10 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
__m128i in[16], l[16], r[16], *curr1;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
@@ -1312,8 +1301,8 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
dc_value = _mm_set1_epi16(a);
for (i = 0; i < 16; ++i) {
- RECON_AND_STORE(dest + 0, dc_value);
- RECON_AND_STORE(dest + 8, dc_value);
+ RECON_AND_STORE(dest + 0, dc_value);
+ RECON_AND_STORE(dest + 8, dc_value);
dest += stride;
}
}
@@ -1891,9 +1880,9 @@ static void idct16_8col(__m128i *in) {
u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
- s[8] = _mm_packs_epi32(u[0], u[1]);
+ s[8] = _mm_packs_epi32(u[0], u[1]);
s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
+ s[9] = _mm_packs_epi32(u[4], u[5]);
s[14] = _mm_packs_epi32(u[6], u[7]);
s[10] = _mm_packs_epi32(u[8], u[9]);
s[13] = _mm_packs_epi32(u[10], u[11]);
@@ -2021,7 +2010,7 @@ static void idct16_8col(__m128i *in) {
s[7] = _mm_add_epi16(t[6], t[7]);
s[8] = t[8];
s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
+ s[9] = _mm_packs_epi32(u[8], u[9]);
s[14] = _mm_packs_epi32(u[10], u[11]);
s[10] = _mm_packs_epi32(u[12], u[13]);
s[13] = _mm_packs_epi32(u[14], u[15]);
@@ -2167,11 +2156,11 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
__m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
+ stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
+ stp1_12_0;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// First 1-D inverse DCT
@@ -2203,7 +2192,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_8 = _mm_packs_epi32(tmp0, tmp2);
stp2_11 = _mm_packs_epi32(tmp5, tmp7);
}
@@ -2267,9 +2256,9 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
tmp2 = _mm_add_epi16(stp2_9, stp2_10);
tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
+ stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
+ stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
@@ -2381,650 +2370,647 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
-#define LOAD_DQCOEFF(reg, input) \
- { \
+#define LOAD_DQCOEFF(reg, input) \
+ { \
reg = load_input_data(input); \
- input += 8; \
- } \
-
-#define IDCT32_34 \
-/* Stage1 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
- \
- const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
- \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
- stg1_1, stp1_16, stp1_31); \
- MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
- stg1_7, stp1_19, stp1_28); \
- MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
- stg1_9, stp1_20, stp1_27); \
- MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
- stg1_15, stp1_23, stp1_24); \
-} \
-\
-/* Stage2 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
- \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
- stg2_1, stp2_8, stp2_15); \
- MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
- stg2_7, stp2_11, stp2_12); \
- \
- stp2_16 = stp1_16; \
- stp2_19 = stp1_19; \
- \
- stp2_20 = stp1_20; \
- stp2_23 = stp1_23; \
- \
- stp2_24 = stp1_24; \
- stp2_27 = stp1_27; \
- \
- stp2_28 = stp1_28; \
- stp2_31 = stp1_31; \
-} \
-\
-/* Stage3 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
- \
- MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
- stg3_1, stp1_4, stp1_7); \
- \
- stp1_8 = stp2_8; \
- stp1_11 = stp2_11; \
- stp1_12 = stp2_12; \
- stp1_15 = stp2_15; \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
- stp1_18, stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
- stp1_22, stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
- stg4_1, stp2_0, stp2_1); \
- \
- stp2_4 = stp1_4; \
- stp2_5 = stp1_4; \
- stp2_6 = stp1_7; \
- stp2_7 = stp1_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
- stp2_10, stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = stp2_0; \
- stp1_1 = stp2_1; \
- stp1_2 = stp2_1; \
- stp1_3 = stp2_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
- stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
- stp2_13, stp2_11, stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-}
+ input += 8; \
+ }
+#define IDCT32_34 \
+ /* Stage1 */ \
+ { \
+ const __m128i zero = _mm_setzero_si128(); \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+ \
+ const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+ \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
+ stp1_31); \
+ MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
+ stp1_28); \
+ MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
+ stp1_27); \
+ MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
+ stp1_24); \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i zero = _mm_setzero_si128(); \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
+ stp2_15); \
+ MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
+ stp2_12); \
+ \
+ stp2_16 = stp1_16; \
+ stp2_19 = stp1_19; \
+ \
+ stp2_20 = stp1_20; \
+ stp2_23 = stp1_23; \
+ \
+ stp2_24 = stp1_24; \
+ stp2_27 = stp1_27; \
+ \
+ stp2_28 = stp1_28; \
+ stp2_31 = stp1_31; \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i zero = _mm_setzero_si128(); \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
+ stp1_7); \
+ \
+ stp1_8 = stp2_8; \
+ stp1_11 = stp2_11; \
+ stp1_12 = stp2_12; \
+ stp1_15 = stp2_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
+ stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
+ stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i zero = _mm_setzero_si128(); \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
+ stp2_1); \
+ \
+ stp2_4 = stp1_4; \
+ stp2_5 = stp1_4; \
+ stp2_6 = stp1_7; \
+ stp2_7 = stp1_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = stp2_0; \
+ stp1_1 = stp2_1; \
+ stp1_2 = stp2_1; \
+ stp1_3 = stp2_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
+ stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+ } \
+ \
+ /* Stage7 */ \
+ { \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
+ stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ }
-#define IDCT32 \
-/* Stage1 */ \
-{ \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
- stp1_17, stp1_30) \
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
- stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
- stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
-} \
-\
-/* Stage2 */ \
-{ \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
- \
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
- stp2_14) \
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
- stp2_11, stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
-} \
-\
-/* Stage3 */ \
-{ \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- \
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
- stp1_6) \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
- stp1_18, stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
- stp1_22, stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
- stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
- stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
- stp2_10, stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
- stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
- stp2_13, stp2_11, stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-}
+#define IDCT32 \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+ const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
+ stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
+ stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
+ stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
+ stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
+ stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
+ stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+ } \
+ \
+ /* Stage7 */ \
+ { \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
+ stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ }
// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
// idct constants for each stage
const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
@@ -3060,15 +3046,13 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
__m128i in[32], col[32];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
- stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
- stp1_30, stp1_31;
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
- stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
- stp2_30, stp2_31;
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
@@ -3236,15 +3220,13 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
__m128i in[32], col[128], zero_idx[16];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
- stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
- stp1_30, stp1_31;
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
- stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
- stp2_30, stp2_31;
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i, j, i32;
@@ -3469,8 +3451,8 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
dc_value = _mm_set1_epi16(a);
for (j = 0; j < 32; ++j) {
- RECON_AND_STORE(dest + 0 + j * stride, dc_value);
- RECON_AND_STORE(dest + 8 + j * stride, dc_value);
+ RECON_AND_STORE(dest + 0 + j * stride, dc_value);
+ RECON_AND_STORE(dest + 8 + j * stride, dc_value);
RECON_AND_STORE(dest + 16 + j * stride, dc_value);
RECON_AND_STORE(dest + 24 + j * stride, dc_value);
}
@@ -3595,8 +3577,7 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
tran_low_t temp_in[4], temp_out[4];
// Columns
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j * 4 + i];
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
vpx_highbd_idct4_c(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -3685,19 +3666,18 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -3789,19 +3769,18 @@ void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -3897,25 +3876,24 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
{
__m128i d[2];
for (i = 0; i < 16; i++) {
- inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
- inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
- inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
- inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ inptr[i] = _mm_add_epi16(inptr[i], rounding);
+ inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
+ inptr[i] = _mm_srai_epi16(inptr[i], 6);
+ inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
// Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
@@ -4016,25 +3994,24 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
{
__m128i d[2];
for (i = 0; i < 16; i++) {
- inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
- inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
- inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
- inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ inptr[i] = _mm_add_epi16(inptr[i], rounding);
+ inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
+ inptr[i] = _mm_srai_epi16(inptr[i], 6);
+ inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
// Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
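The high-bitdepth hunks above all finish with the same reconstruction step: add a rounding constant, arithmetic-shift right, add to the destination and clamp to the bit-depth range. A minimal scalar sketch of that step follows; clamp_high() and highbd_recon_row() are illustrative names only, and the rounding constant is taken as half of 1 << shift, matching the "sixteen"/shift-5 and "rounding"/shift-6 pairs in the code above.

#include <stdint.h>

/* Illustrative: clamp to [0, (1 << bd) - 1], as the clamp_high_sse2() calls do. */
static uint16_t clamp_high(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : v > max ? max : v);
}

/* Illustrative scalar mirror of the per-row round/shift/add/clamp store. */
static void highbd_recon_row(uint16_t *dest, const int16_t *in, int n,
                             int shift, int bd) {
  const int32_t rounding = 1 << (shift - 1);
  int k;
  for (k = 0; k < n; ++k)
    dest[k] = clamp_high(dest[k] + ((in[k] + rounding) >> shift), bd);
}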
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index bd520c18e..d762a04ab 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -47,16 +47,16 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
\
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
}
-static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
@@ -95,43 +95,43 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
static INLINE __m128i load_input_data(const tran_low_t *data) {
#if CONFIG_VP9_HIGHBITDEPTH
return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
- data[6], data[7]);
+ data[6], data[7]);
#else
return _mm_load_si128((const __m128i *)data);
#endif
}
static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
- in[0] = load_input_data(input + 0 * 16);
- in[1] = load_input_data(input + 1 * 16);
- in[2] = load_input_data(input + 2 * 16);
- in[3] = load_input_data(input + 3 * 16);
- in[4] = load_input_data(input + 4 * 16);
- in[5] = load_input_data(input + 5 * 16);
- in[6] = load_input_data(input + 6 * 16);
- in[7] = load_input_data(input + 7 * 16);
-
- in[8] = load_input_data(input + 8 * 16);
- in[9] = load_input_data(input + 9 * 16);
- in[10] = load_input_data(input + 10 * 16);
- in[11] = load_input_data(input + 11 * 16);
- in[12] = load_input_data(input + 12 * 16);
- in[13] = load_input_data(input + 13 * 16);
- in[14] = load_input_data(input + 14 * 16);
- in[15] = load_input_data(input + 15 * 16);
+ in[0] = load_input_data(input + 0 * 16);
+ in[1] = load_input_data(input + 1 * 16);
+ in[2] = load_input_data(input + 2 * 16);
+ in[3] = load_input_data(input + 3 * 16);
+ in[4] = load_input_data(input + 4 * 16);
+ in[5] = load_input_data(input + 5 * 16);
+ in[6] = load_input_data(input + 6 * 16);
+ in[7] = load_input_data(input + 7 * 16);
+
+ in[8] = load_input_data(input + 8 * 16);
+ in[9] = load_input_data(input + 9 * 16);
+ in[10] = load_input_data(input + 10 * 16);
+ in[11] = load_input_data(input + 11 * 16);
+ in[12] = load_input_data(input + 12 * 16);
+ in[13] = load_input_data(input + 13 * 16);
+ in[14] = load_input_data(input + 14 * 16);
+ in[15] = load_input_data(input + 15 * 16);
}
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- _mm_storel_epi64((__m128i *)(dest), d0); \
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
}
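RECON_AND_STORE, reformatted above, widens eight destination bytes to 16 bits, adds the already rounded and shifted residual, and saturates back to 8 bits via _mm_packus_epi16. A minimal scalar sketch of the same row update; clamp255() and recon_and_store_scalar() are illustrative names, not libvpx functions:

#include <stdint.h>

static uint8_t clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Illustrative scalar mirror of RECON_AND_STORE for one row of 8 pixels. */
static void recon_and_store_scalar(uint8_t *dest, const int16_t *in_x) {
  int k;
  for (k = 0; k < 8; ++k) dest[k] = clamp255(dest[k] + in_x[k]);
}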
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
// Final rounding and shift
in[0] = _mm_adds_epi16(in[0], final_rounding);
@@ -168,16 +168,16 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
in[14] = _mm_srai_epi16(in[14], 6);
in[15] = _mm_srai_epi16(in[15], 6);
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
- RECON_AND_STORE(dest + 8 * stride, in[8]);
- RECON_AND_STORE(dest + 9 * stride, in[9]);
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+ RECON_AND_STORE(dest + 8 * stride, in[8]);
+ RECON_AND_STORE(dest + 9 * stride, in[9]);
RECON_AND_STORE(dest + 10 * stride, in[10]);
RECON_AND_STORE(dest + 11 * stride, in[11]);
RECON_AND_STORE(dest + 12 * stride, in[12]);
diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
index be1087c1e..85923b447 100644
--- a/vpx_dsp/x86/loopfilter_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <immintrin.h> /* AVX2 */
+#include <immintrin.h> /* AVX2 */
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
@@ -17,387 +17,353 @@ void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
- __m128i abs_p1p0;
-
- const __m128i thresh = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _limit[0]));
- const __m128i blimit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _blimit[0]));
-
- q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
- q4p4 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
- q3p3 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
- q2p2 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
- q1p1 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
- p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
- q0p0 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
- p0q0 = _mm_shuffle_epi32(q0p0, 78);
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
+ const __m128i blimit =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
{
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
- abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
- _mm_subs_epu8(q0p0, q1p1));
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
- _mm_subs_epu8(p0q0, q0p0));
- abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
- _mm_subs_epu8(p1q1, q1p1));
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
- _mm_subs_epu8(q1p1, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
- _mm_subs_epu8(q2p2, q3p3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
}
- // lp filter
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
{
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i t1 = _mm_set1_epi16(0x1);
- __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
- __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
- __m128i qs0 = _mm_xor_si128(p0q0, t80);
- __m128i qs1 = _mm_xor_si128(p1q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
- __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
- filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, qs0ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- filter1 = _mm_unpacklo_epi8(zero, filter1);
- filter1 = _mm_srai_epi16(filter1, 0xB);
- filter2 = _mm_unpacklo_epi8(zero, filter2);
- filter2 = _mm_srai_epi16(filter2, 0xB);
-
- /* Filter1 >> 3 */
- filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
- qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
- /* filt >> 1 */
- filt = _mm_adds_epi16(filter1, t1);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(
- _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
- filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
- qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- flat = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
- _mm_subs_epu8(q0p0, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
- _mm_subs_epu8(q0p0, q3p3)));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
- q5p5 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q5p5),
- (__m64 *) (s + 5 * p)));
-
- q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
- q6p6 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q6p6),
- (__m64 *) (s + 6 * p)));
-
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
- _mm_subs_epu8(q0p0, q4p4)),
- _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
- _mm_subs_epu8(q0p0, q5p5)));
-
- q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
- q7p7 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q7p7),
- (__m64 *) (s + 7 * p)));
-
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
- _mm_subs_epu8(q0p0, q6p6)),
- _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
- _mm_subs_epu8(q0p0, q7p7)));
-
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
- __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
- p7_16 = _mm_unpacklo_epi8(q7p7, zero);
- p6_16 = _mm_unpacklo_epi8(q6p6, zero);
- p5_16 = _mm_unpacklo_epi8(q5p5, zero);
- p4_16 = _mm_unpacklo_epi8(q4p4, zero);
- p3_16 = _mm_unpacklo_epi8(q3p3, zero);
- p2_16 = _mm_unpacklo_epi8(q2p2, zero);
- p1_16 = _mm_unpacklo_epi8(q1p1, zero);
- p0_16 = _mm_unpacklo_epi8(q0p0, zero);
- q0_16 = _mm_unpackhi_epi8(q0p0, zero);
- q1_16 = _mm_unpackhi_epi8(q1p1, zero);
- q2_16 = _mm_unpackhi_epi8(q2p2, zero);
- q3_16 = _mm_unpackhi_epi8(q3p3, zero);
- q4_16 = _mm_unpackhi_epi8(q4p4, zero);
- q5_16 = _mm_unpackhi_epi8(q5p5, zero);
- q6_16 = _mm_unpackhi_epi8(q6p6, zero);
- q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
- _mm_add_epi16(p4_16, p3_16));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
- _mm_add_epi16(q4_16, q3_16));
-
- pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
- _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
- _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p = _mm_add_epi16(eight,
- _mm_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(four,
- _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
- 4);
- flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(q3_16, q0_16)), 3);
-
- flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(p7_16, p7_16);
- sum_q7 = _mm_add_epi16(q7_16, q7_16);
- sum_p3 = _mm_add_epi16(p3_16, p3_16);
- sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
- 4);
- flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q1_16)), 3);
- flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
- 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q2_16)), 3);
- flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
- 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
- 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
- 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
- 4);
- flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
-
- q2p2 = _mm_andnot_si128(flat, q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
- q6p6 = _mm_andnot_si128(flat2, q6p6);
- flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
- q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
-
- q5p5 = _mm_andnot_si128(flat2, q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
-
- q4p4 = _mm_andnot_si128(flat2, q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
-
- q3p3 = _mm_andnot_si128(flat2, q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
-
- q2p2 = _mm_andnot_si128(flat2, q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
-
- q1p1 = _mm_andnot_si128(flat2, q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
-
- q0p0 = _mm_andnot_si128(flat2, q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
}
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
}
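The mask computed near the top of vpx_lpf_horizontal_edge_8_avx2 implements the edge-activity test spelled out in its inline comments: the edge is left alone when abs(p0 - q0) * 2 + abs(p1 - q1) / 2 exceeds blimit, or when any step between adjacent pixels exceeds limit. A minimal scalar sketch of that per-column decision; edge_mask() is an illustrative helper, not the library's reference implementation:

#include <stdlib.h>

/* Returns nonzero when the column may be filtered, per the commented rules. */
static int edge_mask(int blimit, int limit, int p3, int p2, int p1, int p0,
                     int q0, int q1, int q2, int q3) {
  if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) return 0;
  if (abs(p1 - p0) > limit || abs(q1 - q0) > limit) return 0;
  if (abs(p2 - p1) > limit || abs(p3 - p2) > limit) return 0;
  if (abs(q2 - q1) > limit || abs(q3 - q2) > limit) return 0;
  return 1;
}

The SIMD version evaluates the same conditions for 8 columns at once by folding the q-side differences into the p-side register halves and reducing with _mm_max_epu8.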
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
- 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};
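filt_loopfilter_avx2 is the shuffle control that _mm256_shuffle_epi8 applies further down in this diff: index bytes 0-15 select the pixel bytes, and 128 (bit 7 set) forces the corresponding output byte to zero, so each pixel is widened to a 16-bit word. A minimal scalar sketch of the resulting widening; widen_bytes_to_words() is an illustrative name only:

#include <stdint.h>

/* Illustrative: shuffling a pixel row through {k, 128, ...} control pairs is
 * equivalent to zero-extending each byte to a 16-bit word. */
static void widen_bytes_to_words(const uint8_t *src, uint16_t *dst, int n) {
  int k;
  for (k = 0; k < n; ++k) dst[k] = src[k];
}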
@@ -405,575 +371,543 @@ void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
- q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
- p256_0, q256_0;
-
- const __m128i thresh = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _limit[0]));
- const __m128i blimit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _blimit[0]));
-
- p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 5 * p)));
- p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 4 * p)));
- p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 3 * p)));
- p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 2 * p)));
- p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 1 * p)));
- q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 0 * p)));
- q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 1 * p)));
- q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 2 * p)));
- q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 3 * p)));
- q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 4 * p)));
-
- p4 = _mm256_castsi256_si128(p256_4);
- p3 = _mm256_castsi256_si128(p256_3);
- p2 = _mm256_castsi256_si128(p256_2);
- p1 = _mm256_castsi256_si128(p256_1);
- p0 = _mm256_castsi256_si128(p256_0);
- q0 = _mm256_castsi256_si128(q256_0);
- q1 = _mm256_castsi256_si128(q256_1);
- q2 = _mm256_castsi256_si128(q256_2);
- q3 = _mm256_castsi256_si128(q256_3);
- q4 = _mm256_castsi256_si128(q256_4);
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
+ p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+
+ const __m128i thresh =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
+ const __m128i blimit =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+
+ p256_4 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
+ p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+ q256_4 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
+
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
+ flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
{
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+ q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+ q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
+ q256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
}
- // lp filter
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
{
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- __m128i ps1 = _mm_xor_si128(p1, t80);
- __m128i ps0 = _mm_xor_si128(p0, t80);
- __m128i qs0 = _mm_xor_si128(q0, t80);
- __m128i qs1 = _mm_xor_si128(q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
- flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
- flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
- flat_q2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-
- /* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 6 * p)));
- q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 5 * p)));
- p5 = _mm256_castsi256_si128(p256_5);
- q5 = _mm256_castsi256_si128(q256_5);
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
-
- flat2 = _mm_max_epu8(work, flat2);
- p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 7 * p)));
- q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 6 * p)));
- p6 = _mm256_castsi256_si128(p256_6);
- q6 = _mm256_castsi256_si128(q256_6);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
-
- flat2 = _mm_max_epu8(work, flat2);
-
- p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 8 * p)));
- q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 7 * p)));
- p7 = _mm256_castsi256_si128(p256_7);
- q7 = _mm256_castsi256_si128(q256_7);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
- _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
-
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m256i eight = _mm256_set1_epi16(8);
- const __m256i four = _mm256_set1_epi16(4);
- __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
- pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
- res_q;
-
- const __m256i filter = _mm256_load_si256(
- (__m256i const *)filt_loopfilter_avx2);
- p256_7 = _mm256_shuffle_epi8(p256_7, filter);
- p256_6 = _mm256_shuffle_epi8(p256_6, filter);
- p256_5 = _mm256_shuffle_epi8(p256_5, filter);
- p256_4 = _mm256_shuffle_epi8(p256_4, filter);
- p256_3 = _mm256_shuffle_epi8(p256_3, filter);
- p256_2 = _mm256_shuffle_epi8(p256_2, filter);
- p256_1 = _mm256_shuffle_epi8(p256_1, filter);
- p256_0 = _mm256_shuffle_epi8(p256_0, filter);
- q256_0 = _mm256_shuffle_epi8(q256_0, filter);
- q256_1 = _mm256_shuffle_epi8(q256_1, filter);
- q256_2 = _mm256_shuffle_epi8(q256_2, filter);
- q256_3 = _mm256_shuffle_epi8(q256_3, filter);
- q256_4 = _mm256_shuffle_epi8(q256_4, filter);
- q256_5 = _mm256_shuffle_epi8(q256_5, filter);
- q256_6 = _mm256_shuffle_epi8(q256_6, filter);
- q256_7 = _mm256_shuffle_epi8(q256_7, filter);
-
- pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
- _mm256_add_epi16(p256_4, p256_3));
- pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
- _mm256_add_epi16(q256_4, q256_3));
-
- pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
- _mm256_add_epi16(p256_2, p256_1));
- pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
- _mm256_add_epi16(q256_2, q256_1));
- pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-
- pixelFilter_p = _mm256_add_epi16(eight,
- _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
-
- pixetFilter_p2p1p0 = _mm256_add_epi16(four,
- _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(p256_7, p256_0)), 4);
-
- flat2_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(q256_7, q256_0)), 4);
-
- flat2_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(p256_3, p256_0)), 3);
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
- flat_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(q256_3, q256_0)), 3);
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
- flat_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
- sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+ pixetFilter_q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
- sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(
+ four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_1)), 4);
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- flat2_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)),
+ 3);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_1)), 4);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- flat2_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)),
+ 3);
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_1)), 3);
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
- flat_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_1)), 3);
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
- flat_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
- sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_2)), 4);
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
- flat2_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)),
+ 3);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_2)), 4);
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- flat2_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)),
+ 3);
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_2)), 3);
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- flat_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_2)), 3);
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
- flat_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_3)), 4);
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- flat2_p3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_3)), 4);
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
- flat2_q3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)),
+ 3);
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)),
+ 3);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_4)), 4);
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- flat2_p4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_4)), 4);
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
- flat2_q4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_5)), 4);
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- flat2_p5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_5)), 4);
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
- flat2_q5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_6)), 4);
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- flat2_p6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_6)), 4);
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
- flat2_q6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
- }
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p2 = _mm_andnot_si128(flat, p2);
- flat_p2 = _mm_and_si128(flat, flat_p2);
- p2 = _mm_or_si128(flat_p2, p2);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
- p1 = _mm_andnot_si128(flat, ps1);
- flat_p1 = _mm_and_si128(flat, flat_p1);
- p1 = _mm_or_si128(flat_p1, p1);
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- p0 = _mm_andnot_si128(flat, ps0);
- flat_p0 = _mm_and_si128(flat, flat_p0);
- p0 = _mm_or_si128(flat_p0, p0);
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- q0 = _mm_andnot_si128(flat, qs0);
- flat_q0 = _mm_and_si128(flat, flat_q0);
- q0 = _mm_or_si128(flat_q0, q0);
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- q1 = _mm_andnot_si128(flat, qs1);
- flat_q1 = _mm_and_si128(flat, flat_q1);
- q1 = _mm_or_si128(flat_q1, q1);
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
- q2 = _mm_andnot_si128(flat, q2);
- flat_q2 = _mm_and_si128(flat, flat_q2);
- q2 = _mm_or_si128(flat_q2, q2);
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
- p6 = _mm_andnot_si128(flat2, p6);
- flat2_p6 = _mm_and_si128(flat2, flat2_p6);
- p6 = _mm_or_si128(flat2_p6, p6);
- _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
- p5 = _mm_andnot_si128(flat2, p5);
- flat2_p5 = _mm_and_si128(flat2, flat2_p5);
- p5 = _mm_or_si128(flat2_p5, p5);
- _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p4 = _mm_andnot_si128(flat2, p4);
- flat2_p4 = _mm_and_si128(flat2, flat2_p4);
- p4 = _mm_or_si128(flat2_p4, p4);
- _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
- p3 = _mm_andnot_si128(flat2, p3);
- flat2_p3 = _mm_and_si128(flat2, flat2_p3);
- p3 = _mm_or_si128(flat2_p3, p3);
- _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
-
- p2 = _mm_andnot_si128(flat2, p2);
- flat2_p2 = _mm_and_si128(flat2, flat2_p2);
- p2 = _mm_or_si128(flat2_p2, p2);
- _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
-
- p1 = _mm_andnot_si128(flat2, p1);
- flat2_p1 = _mm_and_si128(flat2, flat2_p1);
- p1 = _mm_or_si128(flat2_p1, p1);
- _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
-
- p0 = _mm_andnot_si128(flat2, p0);
- flat2_p0 = _mm_and_si128(flat2, flat2_p0);
- p0 = _mm_or_si128(flat2_p0, p0);
- _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
-
- q0 = _mm_andnot_si128(flat2, q0);
- flat2_q0 = _mm_and_si128(flat2, flat2_q0);
- q0 = _mm_or_si128(flat2_q0, q0);
- _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
-
- q1 = _mm_andnot_si128(flat2, q1);
- flat2_q1 = _mm_and_si128(flat2, flat2_q1);
- q1 = _mm_or_si128(flat2_q1, q1);
- _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
-
- q2 = _mm_andnot_si128(flat2, q2);
- flat2_q2 = _mm_and_si128(flat2, flat2_q2);
- q2 = _mm_or_si128(flat2_q2, q2);
- _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
-
- q3 = _mm_andnot_si128(flat2, q3);
- flat2_q3 = _mm_and_si128(flat2, flat2_q3);
- q3 = _mm_or_si128(flat2_q3, q3);
- _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
-
- q4 = _mm_andnot_si128(flat2, q4);
- flat2_q4 = _mm_and_si128(flat2, flat2_q4);
- q4 = _mm_or_si128(flat2_q4, q4);
- _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
-
- q5 = _mm_andnot_si128(flat2, q5);
- flat2_q5 = _mm_and_si128(flat2, flat2_q5);
- q5 = _mm_or_si128(flat2_q5, q5);
- _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
-
- q6 = _mm_andnot_si128(flat2, q6);
- flat2_q6 = _mm_and_si128(flat2, flat2_q6);
- q6 = _mm_or_si128(flat2_q6, q6);
- _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
}
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+ }
}
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index 739adf31d..e13334ae0 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -19,84 +19,89 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
}
// filter_mask and hev_mask
-#define FILTER_HEV_MASK do { \
- /* (abs(q1 - q0), abs(p1 - p0) */ \
- __m128i flat = abs_diff(q1p1, q0p0); \
- /* abs(p1 - q1), abs(p0 - q0) */ \
- const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
- __m128i abs_p0q0, abs_p1q1, work; \
- \
- /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
- hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
- hev = _mm_cmpgt_epi16(hev, thresh); \
- hev = _mm_packs_epi16(hev, hev); \
- \
- /* const int8_t mask = filter_mask(*limit, *blimit, */ \
- /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
- abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\
- abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
- abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
- mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
- /* abs(p3 - p2), abs(p2 - p1) */ \
- work = abs_diff(p3p2, p2p1); \
- flat = _mm_max_epu8(work, flat); \
- /* abs(q3 - q2), abs(q2 - q1) */ \
- work = abs_diff(q3q2, q2q1); \
- flat = _mm_max_epu8(work, flat); \
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
- mask = _mm_unpacklo_epi64(mask, flat); \
- mask = _mm_subs_epu8(mask, limit); \
- mask = _mm_cmpeq_epi8(mask, zero); \
- mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
-} while (0)
-
-#define FILTER4 do { \
- const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \
- 4, 4, 4, 4, 4, 4, 4, 4); \
- const __m128i t80 = _mm_set1_epi8(0x80); \
- __m128i filter, filter2filter1, work; \
- \
- ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
- qs1qs0 = _mm_xor_si128(q1q0, t80); \
- \
- /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
- work = _mm_subs_epi8(ps1ps0, qs1qs0); \
- filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
- /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
- filter = _mm_subs_epi8(filter, work); \
- filter = _mm_subs_epi8(filter, work); \
- filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
- filter = _mm_and_si128(filter, mask); /* & mask */ \
- filter = _mm_unpacklo_epi64(filter, filter); \
- \
- /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
- /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
- filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
- filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
- filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
- filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
- filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
- filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
- \
- /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
- filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
- filter = _mm_unpacklo_epi8(filter, filter); \
- filter = _mm_srai_epi16(filter, 9); /* round */ \
- filter = _mm_packs_epi16(filter, filter); \
- filter = _mm_andnot_si128(hev, filter); \
- \
- hev = _mm_unpackhi_epi64(filter2filter1, filter); \
- filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
- \
- /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
- qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
- /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
- ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
- qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
- ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
-} while (0)
+#define FILTER_HEV_MASK \
+ do { \
+ /* (abs(q1 - q0), abs(p1 - p0) */ \
+ __m128i flat = abs_diff(q1p1, q0p0); \
+ /* abs(p1 - q1), abs(p0 - q0) */ \
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
+ __m128i abs_p0q0, abs_p1q1, work; \
+ \
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
+ hev = \
+ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+ hev = _mm_cmpgt_epi16(hev, thresh); \
+ hev = _mm_packs_epi16(hev, hev); \
+ \
+ /* const int8_t mask = filter_mask(*limit, *blimit, */ \
+ /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
+ abs_p0q0 = \
+ _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
+ abs_p1q1 = \
+ _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
+ /* abs(p3 - p2), abs(p2 - p1) */ \
+ work = abs_diff(p3p2, p2p1); \
+ flat = _mm_max_epu8(work, flat); \
+ /* abs(q3 - q2), abs(q2 - q1) */ \
+ work = abs_diff(q3q2, q2q1); \
+ flat = _mm_max_epu8(work, flat); \
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
+ mask = _mm_unpacklo_epi64(mask, flat); \
+ mask = _mm_subs_epu8(mask, limit); \
+ mask = _mm_cmpeq_epi8(mask, zero); \
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
+ } while (0)
+
+#define FILTER4 \
+ do { \
+ const __m128i t3t4 = \
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \
+ const __m128i t80 = _mm_set1_epi8(0x80); \
+ __m128i filter, filter2filter1, work; \
+ \
+ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
+ qs1qs0 = _mm_xor_si128(q1q0, t80); \
+ \
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
+ work = _mm_subs_epi8(ps1ps0, qs1qs0); \
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
+ filter = _mm_and_si128(filter, mask); /* & mask */ \
+ filter = _mm_unpacklo_epi64(filter, filter); \
+ \
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
+ \
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
+ filter = _mm_unpacklo_epi8(filter, filter); \
+ filter = _mm_srai_epi16(filter, 9); /* round */ \
+ filter = _mm_packs_epi16(filter, filter); \
+ filter = _mm_andnot_si128(hev, filter); \
+ \
+ hev = _mm_unpackhi_epi64(filter2filter1, filter); \
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
+ \
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
+ qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
+ ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
+ qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
+ ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
+ } while (0)
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
@@ -128,8 +133,8 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
FILTER4;
_mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
- _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
- _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
+ _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
+ _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
_mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1
}
@@ -238,27 +243,27 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
__m128i abs_p1p0;
q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
- q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
- (__m64 *)(s + 4 * p)));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
- (__m64 *)(s + 3 * p)));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
- (__m64 *)(s + 2 * p)));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
- (__m64 *)(s + 1 * p)));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
p1q1 = _mm_shuffle_epi32(q1p1, 78);
q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
- (__m64 *)(s - 0 * p)));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
p0q0 = _mm_shuffle_epi32(q0p0, 78);
{
__m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
fe = _mm_set1_epi8(0xfe);
ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
abs_p0q0 = abs_diff(q0p0, p0q0);
@@ -267,7 +272,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
@@ -276,8 +281,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(abs_diff(q2p2, q1p1),
- abs_diff(q3p3, q2p2));
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
mask = _mm_max_epu8(work, mask);
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, limit);
@@ -339,17 +343,17 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
flat = _mm_and_si128(flat, mask);
q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
- q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
- (__m64 *)(s + 5 * p)));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
- q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
- (__m64 *)(s + 6 * p)));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
- q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
- (__m64 *)(s + 7 * p)));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
flat2 = _mm_max_epu8(work, flat2);
flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
@@ -369,7 +373,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
__m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
__m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
- p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
p6_16 = _mm_unpacklo_epi8(q6p6, zero);
p5_16 = _mm_unpacklo_epi8(q5p5, zero);
p4_16 = _mm_unpacklo_epi8(q4p4, zero);
@@ -392,24 +396,23 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
_mm_add_epi16(q4_16, q3_16));
pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
- pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(four,
- _mm_add_epi16(pixetFilter_p2p1p0,
- pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(p7_16, p0_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(q7_16, q0_16)), 4);
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(q3_16, q0_16)), 3);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
flat_q0p0 = _mm_packus_epi16(res_p, res_q);
@@ -420,18 +423,18 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p1_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q1_16)), 4);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q1_16)), 3);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
flat_q1p1 = _mm_packus_epi16(res_p, res_q);
sum_p7 = _mm_add_epi16(sum_p7, p7_16);
@@ -441,59 +444,59 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p2_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q2_16)), 4);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q2_16)), 3);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
flat_q2p2 = _mm_packus_epi16(res_p, res_q);
sum_p7 = _mm_add_epi16(sum_p7, p7_16);
sum_q7 = _mm_add_epi16(sum_q7, q7_16);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p3_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q3_16)), 4);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
sum_p7 = _mm_add_epi16(sum_p7, p7_16);
sum_q7 = _mm_add_epi16(sum_q7, q7_16);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p4_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q4_16)), 4);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
sum_p7 = _mm_add_epi16(sum_p7, p7_16);
sum_q7 = _mm_add_epi16(sum_q7, q7_16);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p5_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q5_16)), 4);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
sum_p7 = _mm_add_epi16(sum_p7, p7_16);
sum_q7 = _mm_add_epi16(sum_q7, q7_16);
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p6_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q6_16)), 4);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
}
// wide flat
@@ -554,7 +557,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
_mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
}
}
@@ -572,8 +575,8 @@ static INLINE __m128i filter8_mask(const __m128i *const flat,
const __m128i *const other_filt,
const __m128i *const f8_lo,
const __m128i *const f8_hi) {
- const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
- _mm_srli_epi16(*f8_hi, 3));
+ const __m128i f8 =
+ _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
const __m128i result = _mm_and_si128(*flat, f8);
return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
@@ -582,8 +585,8 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
const __m128i *const other_filt,
const __m128i *const f_lo,
const __m128i *const f_hi) {
- const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
- _mm_srli_epi16(*f_hi, 4));
+ const __m128i f =
+ _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
const __m128i result = _mm_and_si128(*flat, f);
return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
@@ -633,7 +636,7 @@ void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
__m128i work;
max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
@@ -832,16 +835,16 @@ void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
__m128i f_hi;
f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
- f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
- _mm_add_epi16(p4_lo, f_lo));
+ f_lo =
+ _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
_mm_add_epi16(p2_lo, p1_lo));
f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
- f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
- _mm_add_epi16(p4_hi, f_hi));
+ f_hi =
+ _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
_mm_add_epi16(p2_hi, p1_hi));
f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
@@ -956,7 +959,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
const __m128i ff = _mm_cmpeq_epi8(fe, fe);
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
abs_p0q0 = abs_diff(q0p0, p0q0);
abs_p1q1 = abs_diff(q1p1, p1q1);
@@ -964,7 +967,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
@@ -973,8 +976,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(abs_diff(q2p2, q1p1),
- abs_diff(q3p3, q2p2));
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
mask = _mm_max_epu8(work, mask);
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, limit);
@@ -982,8 +984,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
// flat_mask4
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
- abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
flat = _mm_max_epu8(abs_p1p0, flat);
flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
flat = _mm_subs_epu8(flat, one);
@@ -1048,14 +1049,14 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
const __m128i t3 = _mm_set1_epi8(3);
const __m128i t80 = _mm_set1_epi8(0x80);
const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
- t80);
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
@@ -1134,8 +1135,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
}
}
-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
- const uint8_t *_blimit0,
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
const uint8_t *_limit0,
const uint8_t *_thresh0,
const uint8_t *_blimit1,
@@ -1170,17 +1170,17 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
{
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
const __m128i one = _mm_set1_epi8(1);
const __m128i fe = _mm_set1_epi8(0xfe);
const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
__m128i work;
// filter_mask and hev_mask
@@ -1188,7 +1188,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
@@ -1196,29 +1196,25 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
mask = _mm_max_epu8(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
mask = _mm_max_epu8(work, mask);
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
// flat_mask4
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
flat = _mm_max_epu8(work, flat);
flat = _mm_subs_epu8(flat, one);
flat = _mm_cmpeq_epi8(flat, zero);
@@ -1289,14 +1285,14 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
const __m128i t1 = _mm_set1_epi8(0x1);
const __m128i t7f = _mm_set1_epi8(0x7f);
- const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
@@ -1412,23 +1408,23 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
// filter_mask and hev_mask
{
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
const __m128i fe = _mm_set1_epi8(0xfe);
const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
__m128i work;
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
@@ -1436,15 +1432,13 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
mask = _mm_max_epu8(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
mask = _mm_max_epu8(work, mask);
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
@@ -1460,14 +1454,14 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
const __m128i t1 = _mm_set1_epi8(0x1);
const __m128i t7f = _mm_set1_epi8(0x7f);
- const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
@@ -1525,44 +1519,44 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
__m128i x8, x9, x10, x11, x12, x13, x14, x15;
// 2-way interleave w/hoisting of unpacks
- x0 = _mm_loadl_epi64((__m128i *)in0); // 1
+ x0 = _mm_loadl_epi64((__m128i *)in0); // 1
x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
- x0 = _mm_unpacklo_epi8(x0, x1); // 1
+ x0 = _mm_unpacklo_epi8(x0, x1); // 1
x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
- x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7
- x1 = _mm_unpacklo_epi8(x2, x3); // 2
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7
+ x1 = _mm_unpacklo_epi8(x2, x3); // 2
- x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9
- x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11
- x2 = _mm_unpacklo_epi8(x4, x5); // 3
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11
+ x2 = _mm_unpacklo_epi8(x4, x5); // 3
- x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13
- x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15
- x3 = _mm_unpacklo_epi8(x6, x7); // 4
- x4 = _mm_unpacklo_epi16(x0, x1); // 9
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15
+ x3 = _mm_unpacklo_epi8(x6, x7); // 4
+ x4 = _mm_unpacklo_epi16(x0, x1); // 9
- x8 = _mm_loadl_epi64((__m128i *)in1); // 2
+ x8 = _mm_loadl_epi64((__m128i *)in1); // 2
x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
- x8 = _mm_unpacklo_epi8(x8, x9); // 5
- x5 = _mm_unpacklo_epi16(x2, x3); // 10
+ x8 = _mm_unpacklo_epi8(x8, x9); // 5
+ x5 = _mm_unpacklo_epi16(x2, x3); // 10
x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
- x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8
- x9 = _mm_unpacklo_epi8(x10, x11); // 6
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8
+ x9 = _mm_unpacklo_epi8(x10, x11); // 6
- x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10
- x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12
- x10 = _mm_unpacklo_epi8(x12, x13); // 7
- x12 = _mm_unpacklo_epi16(x8, x9); // 11
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12
+ x10 = _mm_unpacklo_epi8(x12, x13); // 7
+ x12 = _mm_unpacklo_epi16(x8, x9); // 11
- x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14
- x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16
- x11 = _mm_unpacklo_epi8(x14, x15); // 8
- x13 = _mm_unpacklo_epi16(x10, x11); // 12
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16
+ x11 = _mm_unpacklo_epi8(x14, x15); // 8
+ x13 = _mm_unpacklo_epi16(x10, x11); // 12
- x6 = _mm_unpacklo_epi32(x4, x5); // 13
- x7 = _mm_unpackhi_epi32(x4, x5); // 14
+ x6 = _mm_unpacklo_epi32(x4, x5); // 13
+ x7 = _mm_unpackhi_epi32(x4, x5); // 14
x14 = _mm_unpacklo_epi32(x12, x13); // 15
x15 = _mm_unpackhi_epi32(x12, x13); // 16
@@ -1598,23 +1592,31 @@ static INLINE void transpose(unsigned char *src[], int in_p,
unsigned char *in = src[idx8x8];
unsigned char *out = dst[idx8x8];
- x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
- x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
x0 = _mm_unpacklo_epi8(x0, x1);
- x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
- x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
// 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
x1 = _mm_unpacklo_epi8(x2, x3);
- x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
- x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
// 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
x2 = _mm_unpacklo_epi8(x4, x5);
- x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
- x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
// 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
x3 = _mm_unpacklo_epi8(x6, x7);
@@ -1624,15 +1626,15 @@ static INLINE void transpose(unsigned char *src[], int in_p,
x5 = _mm_unpacklo_epi16(x2, x3);
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
x6 = _mm_unpacklo_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 0*out_p),
+ _mm_storel_pd((double *)(out + 0 * out_p),
_mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
- _mm_storeh_pd((double *)(out + 1*out_p),
+ _mm_storeh_pd((double *)(out + 1 * out_p),
_mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
// 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
x7 = _mm_unpackhi_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 2*out_p),
+ _mm_storel_pd((double *)(out + 2 * out_p),
_mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
- _mm_storeh_pd((double *)(out + 3*out_p),
+ _mm_storeh_pd((double *)(out + 3 * out_p),
_mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
@@ -1641,25 +1643,23 @@ static INLINE void transpose(unsigned char *src[], int in_p,
x5 = _mm_unpackhi_epi16(x2, x3);
// 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
x6 = _mm_unpacklo_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 4*out_p),
+ _mm_storel_pd((double *)(out + 4 * out_p),
_mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
- _mm_storeh_pd((double *)(out + 5*out_p),
+ _mm_storeh_pd((double *)(out + 5 * out_p),
_mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
// 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
x7 = _mm_unpackhi_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 6*out_p),
+ _mm_storel_pd((double *)(out + 6 * out_p),
_mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
- _mm_storeh_pd((double *)(out + 7*out_p),
+ _mm_storeh_pd((double *)(out + 7 * out_p),
_mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
} while (++idx8x8 < num_8x8_to_transpose);
}
void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
unsigned char *src[2];
@@ -1705,10 +1705,8 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
}
void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
unsigned char *src[2];
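The transpose() helper above is only reflowed here; the unpacklo/unpackhi ladder still performs an 8x8 byte transpose so the dual vertical filters can run the horizontal filtering code on transposed rows. A scalar sketch of what that sequence computes (the reference name is illustrative, not part of the patch):

    /* 8x8 byte transpose: column j of the input becomes row j of the output
     * (illustrative reference only). */
    static void transpose_8x8_ref(const unsigned char *in, int in_p,
                                  unsigned char *out, int out_p) {
      int i, j;
      for (i = 0; i < 8; ++i)
        for (j = 0; j < 8; ++j)
          out[j * out_p + i] = in[i * in_p + j];
    }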
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index 8aa4568d6..d3885d7e1 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -14,18 +14,19 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+static INLINE __m128i load_coefficients(const tran_low_t* coeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4],
- (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
#else
- return _mm_load_si128((const __m128i *)coeff_ptr);
+ return _mm_load_si128((const __m128i*)coeff_ptr);
#endif
}
static INLINE void store_coefficients(__m128i coeff_vals,
- tran_low_t *coeff_ptr) {
+ tran_low_t* coeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
__m128i one = _mm_set1_epi16(1);
__m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
@@ -44,8 +45,7 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
+ uint16_t* eob_ptr, const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
(void)scan_ptr;
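Only the pointer spacing and argument wrapping change in this hunk; the logic is untouched. With CONFIG_VP9_HIGHBITDEPTH, tran_low_t is 32 bits wide, so load_coefficients() narrows eight coefficients into the 16-bit lanes the SSE2 quantizer operates on, which is what the _mm_setr_epi16() call spells out. A scalar sketch of that narrowing (helper name is illustrative, not part of libvpx):

    #include <stdint.h>

    /* Illustrative reference: narrow eight 32-bit coefficients to the 16-bit
     * lanes used by the SSE2 path, mirroring the _mm_setr_epi16() call. */
    static void narrow_coefficients_ref(const int32_t *coeff, int16_t out[8]) {
      int i;
      for (i = 0; i < 8; ++i) out[i] = (int16_t)coeff[i];
    }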
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index 793658f9e..962b8fb11 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -11,10 +11,8 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_sad32x32x4d_avx2(const uint8_t *src,
- int src_stride,
- const uint8_t *const ref[4],
- int ref_stride,
+void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
uint32_t res[4]) {
__m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
@@ -30,7 +28,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src,
sum_ref1 = _mm256_set1_epi16(0);
sum_ref2 = _mm256_set1_epi16(0);
sum_ref3 = _mm256_set1_epi16(0);
- for (i = 0; i < 32 ; i++) {
+ for (i = 0; i < 32; i++) {
// load src and all refs
src_reg = _mm256_loadu_si256((const __m256i *)src);
ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
@@ -48,11 +46,11 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src,
sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
- src+= src_stride;
- ref0+= ref_stride;
- ref1+= ref_stride;
- ref2+= ref_stride;
- ref3+= ref_stride;
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
}
{
__m128i sum;
@@ -81,10 +79,8 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src,
}
}
-void vpx_sad64x64x4d_avx2(const uint8_t *src,
- int src_stride,
- const uint8_t *const ref[4],
- int ref_stride,
+void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
uint32_t res[4]) {
__m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
__m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
@@ -102,7 +98,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src,
sum_ref1 = _mm256_set1_epi16(0);
sum_ref2 = _mm256_set1_epi16(0);
sum_ref3 = _mm256_set1_epi16(0);
- for (i = 0; i < 64 ; i++) {
+ for (i = 0; i < 64; i++) {
// load 64 bytes from src and all refs
src_reg = _mm256_loadu_si256((const __m256i *)src);
srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
@@ -133,11 +129,11 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src,
sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
- src+= src_stride;
- ref0+= ref_stride;
- ref1+= ref_stride;
- ref2+= ref_stride;
- ref3+= ref_stride;
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
}
{
__m128i sum;
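For reference, the computation these two kernels vectorize is the plain 4-way SAD below: one pass over the source block accumulates the absolute differences against each of the four reference blocks (signature and names are illustrative, not part of libvpx):

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative reference for the x4d SAD kernels above. */
    static void sad_nxn_x4_ref(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               int n, uint32_t res[4]) {
      int k, i, j;
      for (k = 0; k < 4; ++k) {
        uint32_t sad = 0;
        for (i = 0; i < n; ++i)
          for (j = 0; j < n; ++j)
            sad += abs(src[i * src_stride + j] - ref[k][i * ref_stride + j]);
        res[k] = sad;
      }
    }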
diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c
index ce9ad8f78..d94413430 100644
--- a/vpx_dsp/x86/sad_avx2.c
+++ b/vpx_dsp/x86/sad_avx2.c
@@ -11,75 +11,74 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
-#define FSAD64_H(h) \
-unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
- int src_stride, \
- const uint8_t *ref_ptr, \
- int ref_stride) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- for (i = 0 ; i < h ; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
- sad1_reg = _mm256_sad_epu8(ref1_reg, \
- _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8(ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
- sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr+= ref_stride; \
- src_ptr+= src_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
-}
+#define FSAD64_H(h) \
+ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ return res; \
+ }
-#define FSAD32_H(h) \
-unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
- int src_stride, \
- const uint8_t *ref_ptr, \
- int ref_stride) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- int ref2_stride = ref_stride << 1; \
- int src2_stride = src_stride << 1; \
- int max = h >> 1; \
- for (i = 0 ; i < max ; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
- sad1_reg = _mm256_sad_epu8(ref1_reg, \
- _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8(ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
- sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr+= ref2_stride; \
- src_ptr+= src2_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
-}
+#define FSAD32_H(h) \
+ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ return res; \
+ }
-#define FSAD64 \
-FSAD64_H(64); \
-FSAD64_H(32);
+#define FSAD64 \
+ FSAD64_H(64); \
+ FSAD64_H(32);
-#define FSAD32 \
-FSAD32_H(64); \
-FSAD32_H(32); \
-FSAD32_H(16);
+#define FSAD32 \
+ FSAD32_H(64); \
+ FSAD32_H(32); \
+ FSAD32_H(16);
FSAD64;
FSAD32;
@@ -89,88 +88,86 @@ FSAD32;
#undef FSAD64_H
#undef FSAD32_H
-#define FSADAVG64_H(h) \
-unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
- int src_stride, \
- const uint8_t *ref_ptr, \
- int ref_stride, \
- const uint8_t *second_pred) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- for (i = 0 ; i < h ; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
- ref1_reg = _mm256_avg_epu8(ref1_reg, \
- _mm256_loadu_si256((__m256i const *)second_pred)); \
- ref2_reg = _mm256_avg_epu8(ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
- sad1_reg = _mm256_sad_epu8(ref1_reg, \
- _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8(ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
- sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr+= ref_stride; \
- src_ptr+= src_stride; \
- second_pred+= 64; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
-}
+#define FSADAVG64_H(h) \
+ unsigned int vpx_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ return res; \
+ }
-#define FSADAVG32_H(h) \
-unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
- int src_stride, \
- const uint8_t *ref_ptr, \
- int ref_stride, \
- const uint8_t *second_pred) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- int ref2_stride = ref_stride << 1; \
- int src2_stride = src_stride << 1; \
- int max = h >> 1; \
- for (i = 0 ; i < max ; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
- ref1_reg = _mm256_avg_epu8(ref1_reg, \
- _mm256_loadu_si256((__m256i const *)second_pred)); \
- ref2_reg = _mm256_avg_epu8(ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
- sad1_reg = _mm256_sad_epu8(ref1_reg, \
- _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8(ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
- sum_sad = _mm256_add_epi32(sum_sad, \
- _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr+= ref2_stride; \
- src_ptr+= src2_stride; \
- second_pred+= 64; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
-}
+#define FSADAVG32_H(h) \
+ unsigned int vpx_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ return res; \
+ }
-#define FSADAVG64 \
-FSADAVG64_H(64); \
-FSADAVG64_H(32);
+#define FSADAVG64 \
+ FSADAVG64_H(64); \
+ FSADAVG64_H(32);
-#define FSADAVG32 \
-FSADAVG32_H(64); \
-FSADAVG32_H(32); \
-FSADAVG32_H(16);
+#define FSADAVG32 \
+ FSADAVG32_H(64); \
+ FSADAVG32_H(32); \
+ FSADAVG32_H(16);
FSADAVG64;
FSADAVG32;
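The reindented macros still expand exactly as before: FSAD64 instantiates vpx_sad64x64_avx2 and vpx_sad64x32_avx2, FSAD32 the 32x64/32x32/32x16 variants, and the FSADAVG* forms do the same for the second-prediction averaging versions. The shared epilogue folds the four 64-bit partial sums that _mm256_sad_epu8 leaves in sum_sad down to one 32-bit total; a scalar view of that fold (illustrative only):

    #include <stdint.h>

    /* Illustrative view of the srli_si256 / extracti128 / add epilogue:
     * lanes[0..1] sit in the low 128-bit half, lanes[2..3] in the high half. */
    static unsigned int fold_sad_lanes_ref(const uint64_t lanes[4]) {
      return (unsigned int)(lanes[0] + lanes[1] + lanes[2] + lanes[3]);
    }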
diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h
index 536b20687..f8edb1b78 100644
--- a/vpx_dsp/x86/txfm_common_sse2.h
+++ b/vpx_dsp/x86/txfm_common_sse2.h
@@ -14,15 +14,15 @@
#include <emmintrin.h>
#include "vpx/vpx_integer.h"
-#define pair_set_epi16(a, b) \
+#define pair_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-#define dual_set_epi16(a, b) \
+#define dual_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
(int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
-#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
_mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
(int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
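Only the continuation backslashes move here; the lane layout of the helpers is unchanged. Because _mm_set_epi16() lists its arguments from the highest lane down, pair_set_epi16(a, b) still produces {a, b, a, b, ...} in lane order, as the small self-contained check below illustrates (the check itself is not part of the header):

    #include <emmintrin.h>
    #include <stdint.h>

    #define pair_set_epi16(a, b)                                            \
      _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                    (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))

    /* Illustrative check, not part of libvpx. */
    static void pair_example(int16_t out[8]) {
      const __m128i v = pair_set_epi16(3, -7);
      _mm_storeu_si128((__m128i *)out, v); /* out = {3, -7, 3, -7, 3, -7, 3, -7} */
    }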
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index f8c97117d..7bc2693cf 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -14,13 +14,13 @@ typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
unsigned int *sse, int *sum);
void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum);
+ const uint8_t *ref, int ref_stride, unsigned int *sse,
+ int *sum);
static void variance_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int w, int h, unsigned int *sse, int *sum,
- get_var_avx2 var_fn, int block_size) {
+ const uint8_t *ref, int ref_stride, int w, int h,
+ unsigned int *sse, int *sum, get_var_avx2 var_fn,
+ int block_size) {
int i, j;
*sse = 0;
@@ -30,21 +30,20 @@ static void variance_avx2(const uint8_t *src, int src_stride,
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
- var_fn(&src[src_stride * i + j], src_stride,
- &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
+ var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
+ ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
-
unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vpx_get16x16var_avx2, 16);
+ variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_get16x16var_avx2, 16);
return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
}
@@ -60,8 +59,8 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
- sse, &sum, vpx_get32x32var_avx2, 32);
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
+ vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 9);
}
@@ -69,8 +68,8 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
- sse, &sum, vpx_get32x32var_avx2, 32);
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
+ vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 10);
}
@@ -78,8 +77,8 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
- sse, &sum, vpx_get32x32var_avx2, 32);
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
+ vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 12);
}
@@ -87,79 +86,58 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
- sse, &sum, vpx_get32x32var_avx2, 32);
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
+ vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
- int height,
- unsigned int *sse);
-
-unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sseptr);
+ int height, unsigned int *sse);
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sseptr);
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
+ int src_stride, int x_offset,
+ int y_offset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
unsigned int sse1;
- const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 64, &sse1);
+ const int se1 = vpx_sub_pixel_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
unsigned int sse2;
- const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
- x_offset, y_offset,
- dst + 32, dst_stride,
- 64, &sse2);
+ const int se2 =
+ vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
+ dst + 32, dst_stride, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
+ int src_stride, int x_offset,
+ int y_offset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 32, sse);
+ const int se = vpx_sub_pixel_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
-unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse,
- const uint8_t *sec) {
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
unsigned int sse1;
- const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 64, 64, &sse1);
+ const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
unsigned int sse2;
- const int se2 =
- vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
- y_offset, dst + 32, dst_stride,
- sec + 32, 64, 64, &sse2);
+ const int se2 = vpx_sub_pixel_avg_variance32xh_avx2(
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
+ 64, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
@@ -167,17 +145,11 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
return *sse - (((int64_t)se * se) >> 12);
}
-unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse,
- const uint8_t *sec) {
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
// Process 32 elements in parallel.
- const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 32, 32, sse);
+ const int se = vpx_sub_pixel_avg_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
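The return expressions in this file are untouched by the reformat: each computes variance as SSE minus sum squared over the pixel count, with the division folded into a shift (>> 8 for 16x16, >> 10 for 32x32, >> 12 for 64x64, and so on). A scalar statement of that arithmetic (helper name is illustrative):

    #include <stdint.h>

    /* Illustrative reference: variance = SSE - sum^2 / N, with N = 1 << log2_pixels. */
    static uint32_t block_variance_ref(uint32_t sse, int sum, int log2_pixels) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> log2_pixels);
    }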
diff --git a/vpx_dsp/x86/variance_impl_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c
index b289e9a0c..5a1f125e5 100644
--- a/vpx_dsp/x86/variance_impl_avx2.c
+++ b/vpx_dsp/x86/variance_impl_avx2.c
@@ -14,306 +14,289 @@
#include "vpx_ports/mem.h"
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16,
+ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
+void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *SSE, int *Sum) {
+ __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+ __m256i ref_expand_high, madd_low, madd_high;
+ unsigned int i, src_2strides, ref_2strides;
+ __m256i zero_reg = _mm256_set1_epi16(0);
+ __m256i sum_ref_src = _mm256_set1_epi16(0);
+ __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+  // processing two strides in a 256 bit register, halving the number of
+  // loop iterations (compared to the sse2 code)
+ src_2strides = source_stride << 1;
+ ref_2strides = recon_stride << 1;
+ for (i = 0; i < 8; i++) {
+ src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
+ src = _mm256_inserti128_si256(
+ src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
+
+ ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
+ ref = _mm256_inserti128_si256(
+ ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
+
+ // expanding to 16 bit each lane
+ src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+ src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+ ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+ ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+ // src-ref
+ src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+ src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+ // madd low (src - ref)
+ madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+ // add high to low
+ src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+ // madd high (src - ref)
+ madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+ sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+ // add high to low
+ madd_ref_src =
+ _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
+
+ src_ptr += src_2strides;
+ ref_ptr += ref_2strides;
+ }
-void vpx_get16x16var_avx2(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum) {
- __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
- __m256i ref_expand_high, madd_low, madd_high;
- unsigned int i, src_2strides, ref_2strides;
- __m256i zero_reg = _mm256_set1_epi16(0);
- __m256i sum_ref_src = _mm256_set1_epi16(0);
- __m256i madd_ref_src = _mm256_set1_epi16(0);
-
- // processing two strides in a 256 bit register reducing the number
- // of loop stride by half (comparing to the sse2 code)
- src_2strides = source_stride << 1;
- ref_2strides = recon_stride << 1;
- for (i = 0; i < 8; i++) {
- src = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i const *) (src_ptr)));
- src = _mm256_inserti128_si256(src,
- _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1);
-
- ref =_mm256_castsi128_si256(
- _mm_loadu_si128((__m128i const *) (ref_ptr)));
- ref = _mm256_inserti128_si256(ref,
- _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1);
-
- // expanding to 16 bit each lane
- src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
- src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
- ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
- ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
- // src-ref
- src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
- src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
- // madd low (src - ref)
- madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
- // add high to low
- src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
- // madd high (src - ref)
- madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
- sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
- // add high to low
- madd_ref_src = _mm256_add_epi32(madd_ref_src,
- _mm256_add_epi32(madd_low, madd_high));
-
- src_ptr+= src_2strides;
- ref_ptr+= ref_2strides;
- }
-
- {
- __m128i sum_res, madd_res;
- __m128i expand_sum_low, expand_sum_high, expand_sum;
- __m128i expand_madd_low, expand_madd_high, expand_madd;
- __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+ {
+ __m128i sum_res, madd_res;
+ __m128i expand_sum_low, expand_sum_high, expand_sum;
+ __m128i expand_madd_low, expand_madd_high, expand_madd;
+ __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
- // extract the low lane and add it to the high lane
- sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
- _mm256_extractf128_si256(sum_ref_src, 1));
+ // extract the low lane and add it to the high lane
+ sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
+ _mm256_extractf128_si256(sum_ref_src, 1));
- madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
- _mm256_extractf128_si256(madd_ref_src, 1));
+ madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
+ _mm256_extractf128_si256(madd_ref_src, 1));
- // padding each 2 bytes with another 2 zeroed bytes
- expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
- sum_res);
- expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
- sum_res);
+ // padding each 2 bytes with another 2 zeroed bytes
+ expand_sum_low =
+ _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
+ expand_sum_high =
+ _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
- // shifting the sign 16 bits right
- expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
- expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
+ // shifting the sign 16 bits right
+ expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
+ expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
- expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
+ expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
- // expand each 32 bits of the madd result to 64 bits
- expand_madd_low = _mm_unpacklo_epi32(madd_res,
- _mm256_castsi256_si128(zero_reg));
- expand_madd_high = _mm_unpackhi_epi32(madd_res,
- _mm256_castsi256_si128(zero_reg));
+ // expand each 32 bits of the madd result to 64 bits
+ expand_madd_low =
+ _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
+ expand_madd_high =
+ _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
- expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
+ expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
- ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
- _mm256_castsi256_si128(zero_reg));
- ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
- _mm256_castsi256_si128(zero_reg));
+ ex_expand_sum_low =
+ _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
+ ex_expand_sum_high =
+ _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
- ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+ ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
- // shift 8 bytes eight
- madd_res = _mm_srli_si128(expand_madd, 8);
- sum_res = _mm_srli_si128(ex_expand_sum, 8);
+    // shift 8 bytes right
+ madd_res = _mm_srli_si128(expand_madd, 8);
+ sum_res = _mm_srli_si128(ex_expand_sum, 8);
- madd_res = _mm_add_epi32(madd_res, expand_madd);
- sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
+ madd_res = _mm_add_epi32(madd_res, expand_madd);
+ sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
- *((int*)SSE)= _mm_cvtsi128_si32(madd_res);
+ *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
- *((int*)Sum)= _mm_cvtsi128_si32(sum_res);
- }
+ *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
+ }
}
-void vpx_get32x32var_avx2(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum) {
- __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
- __m256i ref_expand_high, madd_low, madd_high;
- unsigned int i;
- __m256i zero_reg = _mm256_set1_epi16(0);
- __m256i sum_ref_src = _mm256_set1_epi16(0);
- __m256i madd_ref_src = _mm256_set1_epi16(0);
+void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *SSE, int *Sum) {
+ __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+ __m256i ref_expand_high, madd_low, madd_high;
+ unsigned int i;
+ __m256i zero_reg = _mm256_set1_epi16(0);
+ __m256i sum_ref_src = _mm256_set1_epi16(0);
+ __m256i madd_ref_src = _mm256_set1_epi16(0);
- // processing 32 elements in parallel
- for (i = 0; i < 16; i++) {
- src = _mm256_loadu_si256((__m256i const *) (src_ptr));
+ // processing 32 elements in parallel
+ for (i = 0; i < 16; i++) {
+ src = _mm256_loadu_si256((__m256i const *)(src_ptr));
- ref = _mm256_loadu_si256((__m256i const *) (ref_ptr));
+ ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
- // expanding to 16 bit each lane
- src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
- src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+ // expanding to 16 bit each lane
+ src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+ src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
- ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
- ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+ ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+ ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
- // src-ref
- src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
- src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+ // src-ref
+ src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+ src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
- // madd low (src - ref)
- madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+ // madd low (src - ref)
+ madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
- // add high to low
- src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+ // add high to low
+ src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
- // madd high (src - ref)
- madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+ // madd high (src - ref)
+ madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
- sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+ sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
- // add high to low
- madd_ref_src = _mm256_add_epi32(madd_ref_src,
- _mm256_add_epi32(madd_low, madd_high));
+ // add high to low
+ madd_ref_src =
+ _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
- src_ptr+= source_stride;
- ref_ptr+= recon_stride;
- }
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
- {
- __m256i expand_sum_low, expand_sum_high, expand_sum;
- __m256i expand_madd_low, expand_madd_high, expand_madd;
- __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+ {
+ __m256i expand_sum_low, expand_sum_high, expand_sum;
+ __m256i expand_madd_low, expand_madd_high, expand_madd;
+ __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
- // padding each 2 bytes with another 2 zeroed bytes
- expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
- expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
+ // padding each 2 bytes with another 2 zeroed bytes
+ expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
+ expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
- // shifting the sign 16 bits right
- expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
- expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
+ // shifting the sign 16 bits right
+ expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
+ expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
- expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
+ expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
- // expand each 32 bits of the madd result to 64 bits
- expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
- expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
+ // expand each 32 bits of the madd result to 64 bits
+ expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
+ expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
- expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
+ expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
- ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
- ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
+ ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
+ ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
- ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+ ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
- // shift 8 bytes eight
- madd_ref_src = _mm256_srli_si256(expand_madd, 8);
- sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
+    // shift 8 bytes right
+ madd_ref_src = _mm256_srli_si256(expand_madd, 8);
+ sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
- madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
- sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
+ madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
+ sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
- // extract the low lane and the high lane and add the results
- *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
- _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
+ // extract the low lane and the high lane and add the results
+ *((int *)SSE) =
+ _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
- *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
- }
+ *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
+ }
}
-#define FILTER_SRC(filter) \
- /* filter the source */ \
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
- \
- /* add 8 to source */ \
- exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
- exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
- \
- /* divide source by 16 */ \
- exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-#define MERGE_WITH_SRC(src_reg, reg) \
+#define MERGE_WITH_SRC(src_reg, reg) \
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-#define LOAD_SRC_DST \
- /* load source and destination */ \
- src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
- dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
-#define AVG_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *) \
- (src + size_stride)); \
- /* average between current and next stride source */ \
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ /* average between current and next stride source */ \
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-#define MERGE_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *) \
- (src + size_stride)); \
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
MERGE_WITH_SRC(src_reg, src_next_reg)
-#define CALC_SUM_SSE_INSIDE_LOOP \
- /* expand each byte to 2 bytes */ \
- exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
- exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
- /* source - dest */ \
- exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
- exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
- /* caculate sum */ \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                                       \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
- /* calculate sse */ \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final calculation to sum and sse
-#define CALC_SUM_AND_SSE \
- res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
- sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
- sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
- \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
- \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- int height,
- unsigned int *sse) {
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
@@ -325,66 +308,66 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
        // expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = 0 and y_offset = 8
+ // x_offset = 0 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg;
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = 0 and y_offset = bilin interpolation
+ // x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
}
- // x_offset = 8 and y_offset = 0
+ // x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = 8 and y_offset = 8
+ // x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load source and another source starting from the next
// following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
src_avg = src_reg;
- src+= src_stride;
+ src += src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average between previous average to current average
@@ -393,92 +376,92 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
MERGE_WITH_SRC(src_avg, zero_reg)
// save current source average
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
- // x_offset = 8 and y_offset = bilin interpolation
+ // x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load source and another source starting from the next
// following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
// save current source average
src_avg = src_reg;
- src+= src_stride;
+ src += src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
}
- // x_offset = bilin interpolation and y_offset = 0
+ // x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = bilin interpolation and y_offset = 8
+ // x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average between previous pack to the current
src_pack = _mm256_avg_epu8(src_pack, src_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src_pack = src_reg;
- dst+= dst_stride;
+ dst += dst_stride;
}
- // x_offset = bilin interpolation and y_offset = bilin interpolation
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
- xfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
- yfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load source and another source starting from the next
// following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// convert each 16 bit to 8 bit to each low and high lane source
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
+ for (i = 0; i < height; i++) {
+ src += src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
@@ -489,7 +472,7 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
FILTER_SRC(yfilter)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
}
}
@@ -497,16 +480,10 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
return sum;
}
-unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sse) {
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sse) {
__m256i sec_reg;
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
@@ -519,190 +496,190 @@ unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
+ sec += sec_stride;
        // expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
} else if (y_offset == 8) {
__m256i src_next_reg;
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
+ sec += sec_stride;
        // expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = 0 and y_offset = bilin interpolation
+ // x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
+ sec += sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
}
- // x_offset = 8 and y_offset = 0
+ // x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
+ sec += sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = 8 and y_offset = 8
+ // x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second source starting from the
// following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
// save current source average
src_avg = src_reg;
- src+= src_stride;
+ src += src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous average with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- sec+= sec_stride;
+ sec += sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
- // x_offset = 8 and y_offset = bilin interpolation
+ // x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source starting from the
// following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
// save current source average
src_avg = src_reg;
- src+= src_stride;
+ src += src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
- sec+= sec_stride;
+ sec += sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
}
- // x_offset = bilin interpolation and y_offset = 0
+ // x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
+ for (i = 0; i < height; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
- sec+= sec_stride;
+ sec += sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
+ src += src_stride;
+ dst += dst_stride;
}
- // x_offset = bilin interpolation and y_offset = 8
+ // x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// convert each 16 bit value to 8 bits in each low and high lane of the source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous pack with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- sec+= sec_stride;
+ sec += sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
- // x_offset = bilin interpolation and y_offset = bilin interpolation
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
- xfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
- yfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source starting from the
// following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// convert each 16 bit value to 8 bits in each low and high lane of the source
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
+ for (i = 0; i < height; i++) {
+ src += src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
@@ -712,13 +689,13 @@ unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
- sec+= sec_stride;
+ sec += sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
+ dst += dst_stride;
}
}
}
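
The avg variants above follow the same nine-way split on (x_offset, y_offset) as the plain sub-pixel variance kernels: an offset of 0 means no filtering in that direction, 8 means a half-pel average of two neighbours, and anything else applies a two-tap filter from bilinear_filters_avx2 before the result is averaged with the second predictor sec. A rough scalar sketch of that per-pixel arithmetic follows (helper name invented; it assumes the 16-entry bilinear table with weights summing to 16 and ignores the intermediate 8-bit rounding order of the vector code):

#include <stdint.h>

/* Horizontal blend, vertical blend, then average with the second
 * predictor; _mm256_avg_epu8 rounds up, hence the +1 below. */
static uint8_t bilin_avg_pixel(const uint8_t *src, int stride, int x_offset,
                               int y_offset, uint8_t sec) {
  const int h0 = (src[0] * (16 - x_offset) + src[1] * x_offset + 8) >> 4;
  const int h1 =
      (src[stride] * (16 - x_offset) + src[stride + 1] * x_offset + 8) >> 4;
  const int v = (h0 * (16 - y_offset) + h1 * y_offset + 8) >> 4;
  return (uint8_t)((v + sec + 1) >> 1);
}
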
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 92dc752c0..93457ce70 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -15,9 +15,9 @@
#include "vpx_ports/mem.h"
-typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse, int *sum);
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
__m128i vsum = _mm_setzero_si128();
@@ -31,11 +31,12 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
- return _mm_cvtsi128_si32(vsum);
+ return _mm_cvtsi128_si32(vsum);
}
-#define READ64(p, stride, i) \
- _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+#define READ64(p, stride, i) \
+ _mm_unpacklo_epi8( \
+ _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
static void get4x4var_sse2(const uint8_t *src, int src_stride,
@@ -57,32 +58,31 @@ static void get4x4var_sse2(const uint8_t *src, int src_stride,
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
- vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
- _mm_madd_epi16(diff1, diff1));
+ vsum =
+ _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
*sse = _mm_cvtsi128_si32(vsum);
}
-void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum) {
+void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 8; i += 2) {
- const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(src + i * src_stride)), zero);
- const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i src0 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
- const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(src + (i + 1) * src_stride)), zero);
- const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i src1 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
@@ -104,8 +104,8 @@ void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
}
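
vpx_get8x8var_sse2() above, like get4x4var_sse2(), fills in the getNxMvar_fn_t contract from the typedef at the top of the file: the sum and the sum of squares of (src - ref) over one fixed-size tile. A plain C sketch of that contract (function name invented for illustration):

#include <stdint.h>

/* Accumulate sum(src - ref) and sum((src - ref)^2) over an n x n tile;
 * variance_sse2() below stitches larger blocks out of these tiles. */
static void get_nxn_var_c(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, int n,
                          unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
  }
}
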
void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum) {
+ const uint8_t *ref, int ref_stride, unsigned int *sse,
+ int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
@@ -135,8 +135,8 @@ void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
- *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
- (int16_t)_mm_extract_epi16(vsum, 1);
+ *sum =
+ (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
@@ -144,10 +144,9 @@ void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
*sse = _mm_cvtsi128_si32(vsse);
}
-
static void variance_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- int w, int h, unsigned int *sse, int *sum,
+ const unsigned char *ref, int ref_stride, int w,
+ int h, unsigned int *sse, int *sum,
getNxMvar_fn_t var_fn, int block_size) {
int i, j;
@@ -158,8 +157,8 @@ static void variance_sse2(const unsigned char *src, int src_stride,
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
@@ -178,8 +177,8 @@ unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
- sse, &sum, get4x4var_sse2, 4);
+ variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
+ get4x4var_sse2, 4);
return *sse - ((sum * sum) >> 5);
}
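
The return expression relies on the identity sum((d - mean)^2) = SSE - sum^2 / N with N = w * h, so for 8x4 (N = 32) the correction is (sum * sum) >> 5; the wider variants below shift by log2(w * h) and, where sum * sum could overflow a signed 32-bit int (the 32x32 and larger blocks, with |sum| up to 255 * 1024), route the product through int64_t. The same arithmetic with the shift made explicit (sketch, name invented):

#include <stdint.h>

/* variance = SSE - sum^2 / N; N = w * h is a power of two, so the
 * division is the right shift seen in the returns above and below. */
static unsigned int block_variance(unsigned int sse, int sum, int log2_n) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_n);
}
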
@@ -187,8 +186,8 @@ unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
- sse, &sum, get4x4var_sse2, 4);
+ variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
+ get4x4var_sse2, 4);
return *sse - ((sum * sum) >> 5);
}
@@ -204,8 +203,8 @@ unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
- sse, &sum, vpx_get8x8var_sse2, 8);
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
+ vpx_get8x8var_sse2, 8);
return *sse - ((sum * sum) >> 7);
}
@@ -213,8 +212,8 @@ unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
- sse, &sum, vpx_get8x8var_sse2, 8);
+ variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
+ vpx_get8x8var_sse2, 8);
return *sse - ((sum * sum) >> 7);
}
@@ -230,8 +229,8 @@ unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
- sse, &sum, vpx_get16x16var_sse2, 16);
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
+ vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 10);
}
@@ -239,8 +238,8 @@ unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
- sse, &sum, vpx_get16x16var_sse2, 16);
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
+ vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
@@ -248,8 +247,8 @@ unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
- sse, &sum, vpx_get16x16var_sse2, 16);
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
+ vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
@@ -257,8 +256,8 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
- sse, &sum, vpx_get16x16var_sse2, 16);
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
+ vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 12);
}
@@ -266,8 +265,8 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
- sse, &sum, vpx_get16x16var_sse2, 16);
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
+ vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
@@ -275,8 +274,8 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
- sse, &sum, vpx_get16x16var_sse2, 16);
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
+ vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
@@ -310,17 +309,14 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
// The 2 unused parameters are placeholders for the PIC-enabled build.
// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt) \
- int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse, \
- void *unused0, void *unused)
+#define DECL(w, opt) \
+ int vpx_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
- DECL(4, opt1); \
- DECL(8, opt1); \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse2);
@@ -328,59 +324,52 @@ DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
-unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst, \
- int dst_stride, \
- unsigned int *sse_ptr) { \
- unsigned int sse; \
- int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- h, &sse, NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
-}
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ unsigned int sse; \
+ int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ h, &sse, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
-FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
-FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
-FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
FNS(sse2, sse2);
FNS(ssse3, ssse3);
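
The reformatted FN() macro keeps its original structure: the wf-wide column kernel declared by DECL() is run at column offsets 0, 16, 32 and 48 as needed, the partial se/sse values are accumulated, and the variance identity is applied with shift wlog2 + hlog2. Roughly what one 64x64 instantiation boils down to (hand-written approximation, not the literal preprocessor output; only the prototype comes from DECL(16, sse2) above):

#include <stddef.h>
#include <stdint.h>

/* Prototype as declared by DECL(16, sse2). */
int vpx_sub_pixel_variance16xh_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                    int x_offset, int y_offset,
                                    const uint8_t *dst, ptrdiff_t dst_stride,
                                    int height, unsigned int *sse,
                                    void *unused0, void *unused);

static unsigned int sub_pixel_variance64x64_sketch(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {
  unsigned int sse = 0, sse_col;
  int se = 0, col;
  /* four 16-wide column passes over the full 64-row height */
  for (col = 0; col < 64; col += 16) {
    se += vpx_sub_pixel_variance16xh_sse2(src + col, src_stride, x_offset,
                                          y_offset, dst + col, dst_stride, 64,
                                          &sse_col, NULL, NULL);
    sse += sse_col;
  }
  *sse_ptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> 12); /* 6 + 6 */
}
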
@@ -389,84 +378,69 @@ FNS(ssse3, ssse3);
#undef FN
// The 2 unused parameters are placeholders for the PIC-enabled build.
-#define DECL(w, opt) \
-int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *sec, \
- ptrdiff_t sec_stride, \
- int height, unsigned int *sse, \
- void *unused0, void *unused)
+#define DECL(w, opt) \
+ int vpx_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
#define DECLS(opt1, opt2) \
-DECL(4, opt1); \
-DECL(8, opt1); \
-DECL(16, opt1)
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
-unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst, \
- int dst_stride, \
- unsigned int *sseptr, \
- const uint8_t *sec) { \
- unsigned int sse; \
- int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- sec, w, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sseptr = sse; \
- return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
-}
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; \
+ int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; \
+ return sse - (cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
-FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
-FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
-FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
-FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
-FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);
diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c
index 422b0fc42..727d9d115 100644
--- a/vpx_dsp/x86/vpx_asm_stubs.c
+++ b/vpx_dsp/x86/vpx_asm_stubs.c
@@ -75,7 +75,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_ , sse2);
+FUN_CONV_2D(avg_, sse2);
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
@@ -157,6 +157,6 @@ HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_ , sse2);
+HIGH_FUN_CONV_2D(avg_, sse2);
#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
#endif // HAVE_SSE2
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 01cf4354a..6d53b8705 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -36,35 +36,32 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
};
#if defined(__clang__)
-# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
- (defined(__APPLE__) && defined(__apple_build_version__) && \
- ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
- (__clang_major__ == 5 && __clang_minor__ == 0)))
-
-# define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-# else // clang > 3.3, and not 5.0 on macosx.
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-# endif // clang <= 3.3
+#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *) & (x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
#elif defined(__GNUC__)
-# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-# define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-# else // gcc > 4.7
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-# endif // gcc <= 4.6
-#else // !(gcc || clang)
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *) & (x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
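
The MM256_BROADCASTSI128_SI256 shim exists only because old gcc and clang releases exposed the 128-to-256-bit broadcast intrinsic under the _mm_ prefix; the clang-format change here is purely the '#' indentation. Its use pattern in this file is broadcasting the packed filter taps, roughly as sketched below (helper name invented, assumes the macro above is in scope):

#include <immintrin.h>
#include <stdint.h>

/* Load the eight 16-bit taps, narrow them to bytes in both halves, and
 * replicate the 128-bit register into both lanes of a 256-bit register,
 * as the AVX2 kernels below do. */
static __m256i load_and_broadcast_taps(const int16_t *filter) {
  __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);
  return MM256_BROADCASTSI128_SI256(taps);
}
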
-static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
+static void vpx_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i filtersReg;
__m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
@@ -78,26 +75,22 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// convert the 16 bit (short) values to 8 bit (byte) values, keeping the
// same data in both lanes of the 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// have the same data in both lanes of a 256 bit register
filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
// duplicate only the first 16 bits (first and second byte)
// across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x100u));
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
// duplicate only the second 16 bits (third and fourth byte)
// across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x302u));
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
// duplicate only the third 16 bits (fifth and sixth byte)
// across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x504u));
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
// duplicate only the fourth 16 bits (seventh and eighth byte)
// across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x706u));
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
@@ -107,17 +100,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
// multiply the source and destination strides by two
src_stride = src_pixels_per_line << 1;
dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i-=2) {
+ for (i = output_height; i > 1; i -= 2) {
// load the 2 strides of source
- srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm_loadu_si128((const __m128i *)
- (src_ptr+src_pixels_per_line-3)), 1);
+ srcReg32b1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+ srcReg32b1 = _mm256_inserti128_si256(
+ srcReg32b1,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
+ 1);
// filter the source buffer
- srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
@@ -127,28 +121,29 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
// filter the source buffer
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
// add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
- _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ srcRegFilt32b1_1 = _mm256_adds_epi16(
+ srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm_loadu_si128((const __m128i *)
- (src_ptr+src_pixels_per_line+5)), 1);
+ srcReg32b2 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+ srcReg32b2 = _mm256_inserti128_si256(
+ srcReg32b2,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
+ 1);
// add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
- _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ srcRegFilt32b1_1 = _mm256_adds_epi16(
+ srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
// filter the source buffer
srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
@@ -162,19 +157,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
// filter the source buffer
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
// add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
- _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
- _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
@@ -187,19 +181,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
// shrink each 16 bit value to 8 bits; the first lane contains the first
// convolve result and the second lane contains the second convolve
// result
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
- srcRegFilt32b2_1);
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
- src_ptr+=src_stride;
+ src_ptr += src_stride;
// save 16 bytes
- _mm_store_si128((__m128i*)output_ptr,
- _mm256_castsi256_si128(srcRegFilt32b1_1));
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt32b1_1));
// save the next 16 bytes
- _mm_store_si128((__m128i*)(output_ptr+output_pitch),
- _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
- output_ptr+=dst_stride;
+ _mm_store_si128((__m128i *)(output_ptr + output_pitch),
+ _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr += dst_stride;
}
// if the number of strides is odd.
@@ -211,83 +204,74 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt4Reg));
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(forthFilters));
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
// filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2= _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt3Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
// add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
// reading the next 16 bytes
// (part of it was being read by earlier read)
srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
// filter the source buffer
- srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt4Reg));
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(forthFilters));
+ srcRegFilt2_1 =
+ _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
// add and saturate the results together
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
// filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt3Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
// add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
// shift by 7 bit each 16 bit
srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
@@ -299,16 +283,13 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
// save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
}
}
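
Throughout the horizontal pass above, the two middle partial sums are folded in as adds(result, min(b, c)) followed by adds(result, max(b, c)); with saturating 16-bit adds, taking the smaller (typically negative) term first helps keep the running sum away from saturation. The arithmetic being approximated is the ordinary 8-tap convolution, sketched below in scalar form (name invented; assumes the usual 128-sum subpel filters, with the round-by-64 and shift-by-7 done by addFilterReg64 and the srai calls):

#include <stdint.h>

/* One output pixel of the horizontal 8-tap filter: taps applied to
 * src[-3..4], rounded, shifted, and clamped to 8 bits. */
static uint8_t filter8_h_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 64; /* rounding term */
  for (k = 0; k < 8; ++k) sum += src[k - 3] * filter[k];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
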
-static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
+static void vpx_filter_block1d16_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
__m128i filtersReg;
__m256i addFilterReg64;
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
@@ -323,60 +304,56 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// convert the 16 bit (short) values to 8 bit (byte) values, keeping the
// same data in both lanes of the 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// have the same data in both lanes of a 256 bit register
filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
// duplicate only the first 16 bits (first and second byte)
// across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x100u));
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
// duplicate only the second 16 bits (third and fourth byte)
// across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x302u));
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
// duplicate only the third 16 bits (fifth and sixth byte)
// across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x504u));
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
// duplicate only the fourth 16 bits (seventh and eighth byte)
// across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x706u));
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
// multiply the source and destination strides by two
src_stride = src_pitch << 1;
dst_stride = out_pitch << 1;
// load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr)));
+ srcReg32b1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
// put each pair of consecutive loads in the same 256 bit register
srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm256_castsi256_si128(srcReg32b2), 1);
+ _mm256_castsi256_si128(srcReg32b2), 1);
srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm256_castsi256_si128(srcReg32b3), 1);
+ _mm256_castsi256_si128(srcReg32b3), 1);
srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
- _mm256_castsi256_si128(srcReg32b4), 1);
+ _mm256_castsi256_si128(srcReg32b4), 1);
srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
- _mm256_castsi256_si128(srcReg32b5), 1);
+ _mm256_castsi256_si128(srcReg32b5), 1);
srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
- _mm256_castsi256_si128(srcReg32b6), 1);
+ _mm256_castsi256_si128(srcReg32b6), 1);
srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
- _mm256_castsi256_si128(srcReg32b7), 1);
+ _mm256_castsi256_si128(srcReg32b7), 1);
// merge every two consecutive registers except the last one
srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
@@ -394,89 +371,87 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
// save
srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
- for (i = output_height; i > 1; i-=2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
- srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
- _mm256_castsi256_si128(srcReg32b8), 1);
- srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
- srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
- _mm256_castsi256_si128(srcReg32b9), 1);
-
- // merge every two consecutive registers
- // save
- srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
- srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
- // add and saturate the results together
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
- srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
- src_ptr+=src_stride;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr,
- _mm256_castsi256_si128(srcReg32b1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i*)(output_ptr+out_pitch),
- _mm256_extractf128_si256(srcReg32b1, 1));
-
- output_ptr+=dst_stride;
-
- // save part of the registers for next strides
- srcReg32b10 = srcReg32b11;
- srcReg32b1 = srcReg32b3;
- srcReg32b11 = srcReg32b2;
- srcReg32b3 = srcReg32b5;
- srcReg32b2 = srcReg32b4;
- srcReg32b5 = srcReg32b7;
- srcReg32b7 = srcReg32b9;
+    // shrink each 16 bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr += src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+
+    // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch),
+ _mm256_extractf128_si256(srcReg32b1, 1));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
}
if (i > 0) {
__m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
@@ -485,55 +460,53 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
// merge the last 2 results together
- srcRegFilt4 = _mm_unpacklo_epi8(
- _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
- srcRegFilt7 = _mm_unpackhi_epi8(
- _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 =
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
- _mm256_castsi256_si128(forthFilters));
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
- _mm256_castsi256_si128(forthFilters));
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 =
+ _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
// add and saturate the results together
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
- _mm256_castsi256_si128(secondFilters));
+ _mm256_castsi256_si128(secondFilters));
srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
- _mm256_castsi256_si128(secondFilters));
+ _mm256_castsi256_si128(secondFilters));
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
- _mm256_castsi256_si128(thirdFilters));
+ _mm256_castsi256_si128(thirdFilters));
srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
- _mm256_castsi256_si128(thirdFilters));
+ _mm256_castsi256_si128(thirdFilters));
// add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_min_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
// add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_max_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm256_castsi256_si128(addFilterReg64));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
// shift by 7 bit each 16 bit
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
@@ -545,7 +518,7 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
// save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
}
}
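
The vertical pass above keeps the merged row-pairs of an 8-row window live in registers and, at the bottom of the main loop, rotates them (srcReg32b10 = srcReg32b11; ... srcReg32b7 = srcReg32b9;) so that stepping down two output rows only requires loading the two fresh input rows. The same sliding-window idea in miniature, over plain row pointers (illustrative only, names invented):

#include <stdint.h>

/* Drop the two oldest rows of an 8-row window and append the two rows
 * just loaded, mirroring the register rotation in the loop above. */
static void advance_row_window(const uint8_t *rows[8], const uint8_t *next0,
                               const uint8_t *next1) {
  int k;
  for (k = 0; k < 6; ++k) rows[k] = rows[k + 2];
  rows[6] = next0;
  rows[7] = next1;
}
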
@@ -575,10 +548,10 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
-#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
-#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
-#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
-#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 2bb0f4a24..b26d97b45 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -48,23 +48,20 @@ filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
+void vpx_filter_block1d4_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, srcReg, minReg;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// convert the 16 bit (short) values to 8 bit (byte) values, keeping the
// same data in both lanes of the 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits in the filter into the first lane
firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
@@ -78,23 +75,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
// loading the local filters
- shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
+ shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
for (i = 0; i < output_height; i++) {
srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+ srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+ srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
@@ -110,21 +107,18 @@ void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr+=src_pixels_per_line;
+ src_ptr += src_pixels_per_line;
// save only 4 bytes
- *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
- output_ptr+=output_pitch;
+ output_ptr += output_pitch;
}
}
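In the 4-wide horizontal filter that ends here, _mm_shuffle_epi8 lines the source pixels up in tap order and _mm_maddubs_epi16 does the multiply-accumulate: each unsigned source byte is multiplied by the corresponding signed filter byte, and adjacent products are summed with signed 16-bit saturation. A hedged scalar model of that instruction (a reference sketch for one 128-bit lane, not the intrinsic's official definition):

#include <stdint.h>

static void maddubs_epi16_ref(const uint8_t a[16], const int8_t b[16],
                              int16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    const int32_t sum = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
    out[i] = (int16_t)(sum > 32767 ? 32767 : sum < -32768 ? -32768 : sum);
  }
}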
-void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
+void vpx_filter_block1d8_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
@@ -136,7 +130,7 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits (first and second byte)
// across 128 bit register
@@ -160,16 +154,16 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
+ srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
@@ -179,7 +173,7 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
@@ -190,21 +184,18 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr+=src_pixels_per_line;
+ src_ptr += src_pixels_per_line;
// save only 8 bytes
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+ _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
- output_ptr+=output_pitch;
+ output_ptr += output_pitch;
}
}
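The 8-wide horizontal filter above combines its four _mm_maddubs_epi16 partial sums as p1 + p4 + min(p2, p3) + max(p2, p3) rather than strictly left to right. Since min(a, b) + max(a, b) == a + b, the total is unchanged; the pairing only changes where the saturating adds can clip, which appears intended to keep the intermediate sums in range for worst-case inputs. A one-line check of the identity (sketch, not libvpx code):

static int combine_partials(int p1, int p2, int p3, int p4) {
  const int lo = p2 < p3 ? p2 : p3;
  const int hi = p2 < p3 ? p3 : p2;
  return p1 + p4 + lo + hi;  /* == p1 + p2 + p3 + p4 */
}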
-void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
+void vpx_filter_block1d8_v8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
__m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
@@ -217,7 +208,7 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits in the filter
firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
@@ -269,7 +260,7 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr+=src_pitch;
+ src_ptr += src_pitch;
// shift down a row
srcReg1 = srcReg2;
@@ -281,9 +272,9 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
srcReg7 = srcReg8;
// save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+ _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
- output_ptr+=out_pitch;
+ output_ptr += out_pitch;
}
}
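The vertical 8-tap loop that ends here keeps eight source rows live in registers and, after storing each output row, shifts the window down by one (srcReg1 = srcReg2; ... srcReg7 = srcReg8;) so only a single new row has to be loaded per iteration. A hypothetical scalar outline of that sliding window; the names and the fixed 8-byte row width are illustrative:

#include <stdint.h>
#include <string.h>

static void slide_row_window(uint8_t rows[8][8], const uint8_t *next_row) {
  memmove(rows[0], rows[1], 7 * 8);  /* drop the oldest row, keep rows 1..7 */
  memcpy(rows[7], next_row, 8);      /* append the newly loaded row */
}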
@@ -339,32 +330,33 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
ssse3);
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
- const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
- const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
- const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
- \
- const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
- const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
- const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
- const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
- \
- out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
- out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
- out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
- out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
- out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
- out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
- out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
- out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
-}
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
+ const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
+ const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
+ const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
+ \
+ const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
+ const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
+ const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
+ const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
+ \
+ out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
+ out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
+ out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
+ out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
+ out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
+ out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
+ out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
+ out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
+ }
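TRANSPOSE_8X8 transposes eight 8-byte rows with three rounds of interleaving (bytes, then 16-bit pairs, then 32-bit quads); the final unpacklo/unpackhi_epi64(tr2_x, tr2_x) steps duplicate each result row into both halves of the register, which is harmless because callers such as transpose8x8_to_dst below store only the low 8 bytes with _mm_storel_epi64. A scalar reference of the relation it computes on the low 8 bytes of each input, out[r][c] == in[c][r] (checking aid only, not libvpx code):

#include <stdint.h>

static void transpose_8x8_ref(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) out[r][c] = in[c][r];
}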
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
uint8_t *dst, const int16_t *x_filter) {
@@ -420,7 +412,7 @@ static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
// shrink to 8 bit each 16 bits
temp = _mm_packus_epi16(temp, temp);
// save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)dst, temp);
+ _mm_storel_epi64((__m128i *)dst, temp);
}
static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
@@ -436,23 +428,22 @@ static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
- TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
- A, B, C, D, E, F, G, H);
-
- _mm_storel_epi64((__m128i*)dst, A);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H);
+ TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
+
+ _mm_storel_epi64((__m128i *)dst, A);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
}
static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
int x, y, z;
src -= SUBPEL_TAPS / 2 - 1;
@@ -523,7 +514,7 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
// 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
// 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
// 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
// 02 03 12 13 22 23 32 33
@@ -565,16 +556,16 @@ static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
C = _mm_srli_si128(A, 8);
D = _mm_srli_si128(A, 12);
- *(int *)(dst) = _mm_cvtsi128_si32(A);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
- *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
- *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
+ *(int *)(dst) = _mm_cvtsi128_si32(A);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
+ *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
+ *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}
static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
int x, y, z;
src -= SUBPEL_TAPS / 2 - 1;
@@ -648,8 +639,8 @@ static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
int y;
int y_q4 = y0_q4;
@@ -705,13 +696,13 @@ static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
// shrink to 8 bit each 16 bits
temp = _mm_packus_epi16(temp, temp);
// save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)dst, temp);
+ _mm_storel_epi64((__m128i *)dst, temp);
}
static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
int y;
int y_q4 = y0_q4;
@@ -794,15 +785,15 @@ static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
// result
temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
src_ptr += 16;
- // save 16 bytes convolve result
- _mm_store_si128((__m128i*)&dst[i], temp_hi);
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);
}
}
static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
int y;
int y_q4 = y0_q4;
@@ -822,11 +813,9 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h) {
+ const InterpKernel *const x_filters, int x0_q4,
+ int x_step_q4, const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
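The note above describes a two-pass scheme: a horizontal pass writes into a fixed-size intermediate buffer, and a second pass (not shown in this hunk) filters that buffer vertically into dst. A hedged, scaling-free sketch of that structure with plain 8-tap kernels and illustrative names; the real code additionally steps x_q4/y_q4 per column and row to select kernels:

#include <stdint.h>
#include <stddef.h>

#define TAPS 8

static uint8_t clamp8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

/* Assumes w <= 64 and h <= 64, mirroring the fixed-size temp limitation. */
static void convolve2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t hk[TAPS], const int16_t vk[TAPS],
                              int w, int h) {
  uint8_t temp[64 * (64 + TAPS - 1)];
  int x, y, k;
  /* (1) horizontal pass, producing TAPS - 1 extra rows of vertical context */
  const uint8_t *s = src - src_stride * (TAPS / 2 - 1) - (TAPS / 2 - 1);
  for (y = 0; y < h + TAPS - 1; ++y)
    for (x = 0; x < w; ++x) {
      int sum = 0;
      for (k = 0; k < TAPS; ++k) sum += s[y * src_stride + x + k] * hk[k];
      temp[y * 64 + x] = clamp8((sum + 64) >> 7);
    }
  /* (2) vertical pass from temp into dst */
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x) {
      int sum = 0;
      for (k = 0; k < TAPS; ++k) sum += temp[(y + k) * 64 + x] * vk[k];
      dst[y * dst_stride + x] = clamp8((sum + 64) >> 7);
    }
}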
@@ -881,10 +870,9 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
return (int)((const InterpKernel *)(intptr_t)f - base);
}
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
@@ -892,9 +880,8 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
- scaledconvolve2d(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h);
+ scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
}
// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
@@ -908,4 +895,4 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
+FUN_CONV_2D(avg_, ssse3);