summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--test/datarate_test.cc45
-rw-r--r--test/encode_perf_test.cc2
-rw-r--r--test/lpf_8_test.cc44
-rw-r--r--vp8/vp8_dx_iface.c4
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_16_neon.c45
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_4_neon.c274
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm277
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_8_neon.c453
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm)257
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_neon.c732
-rw-r--r--vp9/common/vp9_idct.c344
-rw-r--r--vp9/common/vp9_idct.h36
-rw-r--r--vp9/common/vp9_mfqe.c64
-rw-r--r--vp9/common/vp9_tile_common.c34
-rw-r--r--vp9/decoder/vp9_decodeframe.c2
-rw-r--r--vp9/decoder/vp9_decoder.c4
-rw-r--r--vp9/decoder/vp9_detokenize.c12
-rw-r--r--vp9/encoder/vp9_bitstream.c4
-rw-r--r--vp9/encoder/vp9_dct.c9
-rw-r--r--vp9/encoder/vp9_denoiser.c19
-rw-r--r--vp9/encoder/vp9_encodeframe.c30
-rw-r--r--vp9/encoder/vp9_encodemb.c2
-rw-r--r--vp9/encoder/vp9_encoder.c4
-rw-r--r--vp9/encoder/vp9_pickmode.c4
-rw-r--r--vp9/encoder/vp9_tokenize.h2
-rw-r--r--vp9/vp9_common.mk10
-rw-r--r--vp9/vp9_dx_iface.c6
27 files changed, 1448 insertions, 1271 deletions
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 3d40148b7..573870e91 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -246,6 +246,8 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
for (int i = 0; i < 3; ++i) {
bits_total_[i] = 0;
}
+ denoiser_offon_test_ = 0;
+ denoiser_offon_period_ = -1;
}
//
@@ -316,6 +318,15 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
if (video->frame() == 1)
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+ if (denoiser_offon_test_) {
+ ASSERT_GT(denoiser_offon_period_, 0)
+ << "denoiser_offon_period_ is not positive.";
+ if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+ // Flip denoiser_on_ periodically
+ denoiser_on_ ^= 1;
+ }
+ }
+
encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
if (cfg_.ts_number_layers > 1) {
@@ -399,6 +410,8 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
vpx_codec_pts_t first_drop_;
int num_drops_;
int denoiser_on_;
+ int denoiser_offon_test_;
+ int denoiser_offon_period_;
};
// Check basic rate targeting,
@@ -653,6 +666,38 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) {
ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
<< " The datarate for the file is greater than target by too much!";
}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9Large, DenoiserOffon) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 299);
+
+ // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+ // there is only one denoiser mode: denoiserYonly(which is 1),
+ // but may add more modes in the future.
+ cfg_.rc_target_bitrate = 300;
+ ResetModel();
+ // The denoiser is off by default.
+ denoiser_on_ = 0;
+ // Set the offon test flag.
+ denoiser_offon_test_ = 1;
+ denoiser_offon_period_ = 100;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+}
#endif // CONFIG_VP9_TEMPORAL_DENOISING
VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index 769e14ae2..7e9f0d6c4 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -189,7 +189,7 @@ TEST_P(VP9EncodePerfTest, PerfTest) {
printf("\t\"totalFrames\" : %u,\n", frames);
printf("\t\"framesPerSecond\" : %f,\n", fps);
printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
- printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]);
+ printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
printf("}\n");
}
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index cdc0a9895..b6c1653fa 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -107,6 +107,36 @@ void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
+#if HAVE_NEON_ASM
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
+void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON_ASM
+
class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
public:
virtual ~Loop8Test6Param() {}
@@ -594,13 +624,22 @@ INSTANTIATE_TEST_CASE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
-#if HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test6Param,
::testing::Values(
#if HAVE_NEON_ASM
+// Using #if inside the macro is unsupported on MSVS but the tests are not
+// currently built for MSVS with ARM and NEON.
make_tuple(&vp9_lpf_horizontal_16_neon,
&vp9_lpf_horizontal_16_c, 8),
+ make_tuple(&wrapper_vertical_16_neon,
+ &wrapper_vertical_16_c, 8),
+ make_tuple(&wrapper_vertical_16_dual_neon,
+ &wrapper_vertical_16_dual_c, 8),
#endif // HAVE_NEON_ASM
make_tuple(&vp9_lpf_horizontal_4_neon,
&vp9_lpf_horizontal_4_c, 8),
@@ -621,6 +660,7 @@ INSTANTIATE_TEST_CASE_P(
&vp9_lpf_vertical_4_dual_c, 8),
make_tuple(&vp9_lpf_vertical_8_dual_neon,
&vp9_lpf_vertical_8_dual_c, 8)));
-#endif // HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON
} // namespace
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 6810644ba..67a0fef64 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -188,7 +188,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
/* vet via sync code */
if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
- res = VPX_CODEC_UNSUP_BITSTREAM;
+ return VPX_CODEC_UNSUP_BITSTREAM;
si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
@@ -402,7 +402,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (!res)
{
VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
- if(resolution_change)
+ if (resolution_change)
{
VP8_COMMON *const pc = & pbi->common;
MACROBLOCKD *const xd = & pbi->mb;
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 09f470e97..c69ee1009 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -124,7 +124,6 @@ static INLINE void vp9_loop_filter_neon_16(
return;
}
-#if !HAVE_NEON_ASM
void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
@@ -178,47 +177,3 @@ void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
vst1q_u8(s, q8u8);
return;
}
-#endif // !HAVE_NEON_ASM
-
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-#if HAVE_NEON_ASM
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
- vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif // HAVE_NEON_ASM
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
new file mode 100644
index 000000000..fd9db6187
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
+ uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+ vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+ vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
+
+void vp9_lpf_horizontal_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_lf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_4_neon(  // filter `count` groups of 8 rows across a vertical edge
+        unsigned char *src,     // top row, column of q0 (rows are read from src - 4)
+        int pitch,              // byte offset between consecutive rows
+        unsigned char *blimit,  // 8 bytes are loaded from each of the three
+        unsigned char *limit,   // threshold pointers (vld1_u8)
+        unsigned char *thresh,
+        int count) {            // number of 8-row groups; 0 is a no-op
+  int i, pitch8;
+  uint8_t *s;  // scratch row pointer; src itself only moves in the loop header
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+
+  if (count == 0)  // end_vp9_lf_v_edge
+    return;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  pitch8 = pitch * 8;
+  for (i = 0; i < count; i++, src += pitch8) {  // src advances exactly 8 rows
+    s = src - 4;  // 4 columns left of the edge: each row holds p3..q3
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),  // 8x8 transpose: rows -> columns
+                      vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                      vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                      vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                      vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
+
+    d4Result.val[0] = d4u8;
+    d4Result.val[1] = d5u8;
+    d4Result.val[2] = d6u8;
+    d4Result.val[3] = d7u8;
+
+    s = src - 2;  // write back through s so src is not disturbed between groups
+    vst4_lane_u8(s, d4Result, 0);  // each store writes 4 bytes of one row
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 1);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 2);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 3);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 4);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 5);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 6);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 7);
+  }
+  return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
new file mode 100644
index 000000000..7738e0d3a
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_lpf_horizontal_4_neon|
+ EXPORT |vp9_lpf_vertical_4_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ cmp r12, #0
+ beq end_vp9_lf_h_edge
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+count_lf_h_loop
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne count_lf_h_loop
+
+end_vp9_lf_h_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_horizontal_4_neon|
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+ cmp r12, #0
+ beq end_vp9_lf_v_edge
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+count_lf_v_loop
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose the 8x8 block
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl vp9_loop_filter_neon
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ subne r2, r0, #4 ; move s pointer down by 4 columns
+ bne count_lf_v_loop
+
+end_vp9_lf_v_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_vertical_4_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|vp9_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; (m1 <= limit)*-1
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; (a <= blimit)*-1
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+ END
diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
new file mode 100644
index 000000000..33068a8a2
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_mbloop_filter_neon(
+ uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p2
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q2
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+ vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_horizontal_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_mblf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_8_neon(  // filter `count` groups of 8 rows across a vertical edge
+        unsigned char *src,     // top row, column of q0 (rows are read from src - 4)
+        int pitch,              // byte offset between consecutive rows
+        unsigned char *blimit,  // 8 bytes are loaded from each of the three
+        unsigned char *limit,   // threshold pointers (vld1_u8)
+        unsigned char *thresh,
+        int count) {            // number of 8-row groups; 0 is a no-op
+  int i;
+  uint8_t *s;  // scratch row pointer for the loads and both store passes
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+  uint8x8x2_t d2Result;
+
+  if (count == 0)  // nothing to filter
+    return;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  for (i = 0; i < count; i++, src += (pitch << 3)) {  // src advances 8 rows
+    s = src - 4;  // 4 columns left of the edge: each row holds p3..q3
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),  // 8x8 transpose: rows -> columns
+                      vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                      vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                      vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                      vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+    d4Result.val[0] = d0u8;
+    d4Result.val[1] = d1u8;
+    d4Result.val[2] = d2u8;
+    d4Result.val[3] = d3u8;
+
+    d2Result.val[0] = d4u8;
+    d2Result.val[1] = d5u8;
+
+    s = src - 3;  // first four outputs go to columns -3..0 of the current group
+    vst4_lane_u8(s, d4Result, 0);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 1);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 2);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 3);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 4);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 5);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 6);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 7);
+
+    s = src + 1;  // last two outputs go to columns 1..2 of the current group
+    vst2_lane_u8(s, d2Result, 0);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 1);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 2);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 3);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 4);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 5);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 6);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 7);
+  }
+  return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
index 443032217..91aaec04e 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
@@ -8,8 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_lpf_horizontal_4_neon|
- EXPORT |vp9_lpf_vertical_4_neon|
EXPORT |vp9_lpf_horizontal_8_neon|
EXPORT |vp9_lpf_vertical_8_neon|
ARM
@@ -21,261 +19,6 @@
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_horizontal_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- ldr r2, [sp, #4] ; load thresh
- add r1, r1, r1 ; double pitch
-
- cmp r12, #0
- beq end_vp9_lf_h_edge
-
- vld1.8 {d1[]}, [r3] ; duplicate *limit
- vld1.8 {d2[]}, [r2] ; duplicate *thresh
-
-count_lf_h_loop
- sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
- add r3, r2, r1, lsr #1 ; set to 3 lines down
-
- vld1.u8 {d3}, [r2@64], r1 ; p3
- vld1.u8 {d4}, [r3@64], r1 ; p2
- vld1.u8 {d5}, [r2@64], r1 ; p1
- vld1.u8 {d6}, [r3@64], r1 ; p0
- vld1.u8 {d7}, [r2@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r2@64] ; q2
- vld1.u8 {d18}, [r3@64] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r3, r3, r1, lsl #1
-
- bl vp9_loop_filter_neon
-
- vst1.u8 {d4}, [r2@64], r1 ; store op1
- vst1.u8 {d5}, [r3@64], r1 ; store op0
- vst1.u8 {d6}, [r2@64], r1 ; store oq0
- vst1.u8 {d7}, [r3@64], r1 ; store oq1
-
- add r0, r0, #8
- subs r12, r12, #1
- bne count_lf_h_loop
-
-end_vp9_lf_h_edge
- pop {pc}
- ENDP ; |vp9_lpf_horizontal_4_neon|
-
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
-; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_vertical_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- vld1.8 {d1[]}, [r3] ; duplicate *limit
-
- ldr r3, [sp, #4] ; load thresh
- sub r2, r0, #4 ; move s pointer down by 4 columns
- cmp r12, #0
- beq end_vp9_lf_v_edge
-
- vld1.8 {d2[]}, [r3] ; duplicate *thresh
-
-count_lf_v_loop
- vld1.u8 {d3}, [r2], r1 ; load s data
- vld1.u8 {d4}, [r2], r1
- vld1.u8 {d5}, [r2], r1
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d7}, [r2], r1
- vld1.u8 {d16}, [r2], r1
- vld1.u8 {d17}, [r2], r1
- vld1.u8 {d18}, [r2]
-
- ;transpose to 8x16 matrix
- vtrn.32 d3, d7
- vtrn.32 d4, d16
- vtrn.32 d5, d17
- vtrn.32 d6, d18
-
- vtrn.16 d3, d5
- vtrn.16 d4, d6
- vtrn.16 d7, d17
- vtrn.16 d16, d18
-
- vtrn.8 d3, d4
- vtrn.8 d5, d6
- vtrn.8 d7, d16
- vtrn.8 d17, d18
-
- bl vp9_loop_filter_neon
-
- sub r0, r0, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
- vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
- vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
- vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
- vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
- vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
- vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
- vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
- add r0, r0, r1, lsl #3 ; s += pitch * 8
- subs r12, r12, #1
- subne r2, r0, #4 ; move s pointer down by 4 columns
- bne count_lf_v_loop
-
-end_vp9_lf_v_edge
- pop {pc}
- ENDP ; |vp9_lpf_vertical_4_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0 blimit
-; d1 limit
-; d2 thresh
-; d3 p3
-; d4 p2
-; d5 p1
-; d6 p0
-; d7 q0
-; d16 q1
-; d17 q2
-; d18 q3
-;
-; Outputs:
-; d4 op1
-; d5 op0
-; d6 oq0
-; d7 oq1
-|vp9_loop_filter_neon| PROC
- ; filter_mask
- vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
- vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
- vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
- vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
- vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
- vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
-
- ; only compare the largest value to limit
- vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
- vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
-
- vabd.u8 d17, d6, d7 ; abs(p0 - q0)
-
- vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
-
- vmov.u8 d18, #0x80
-
- vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
-
- ; hevmask
- vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
-
- vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
- vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
-
- veor d7, d7, d18 ; qs0
-
- vcge.u8 d23, d1, d23 ; abs(m1) > limit
-
- ; filter() function
- ; convert to signed
-
- vshr.u8 d28, d28, #1 ; a = a / 2
- veor d6, d6, d18 ; ps0
-
- veor d5, d5, d18 ; ps1
- vqadd.u8 d17, d17, d28 ; a = b + a
-
- veor d16, d16, d18 ; qs1
-
- vmov.u8 d19, #3
-
- vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
-
- vcge.u8 d17, d0, d17 ; a > blimit
-
- vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
- vorr d22, d21, d22 ; hevmask
-
- vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
-
- vand d27, d27, d22 ; filter &= hev
- vand d23, d23, d17 ; filter_mask
-
- vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
-
- vmov.u8 d17, #4
-
- ; filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d27, q12
-
- vand d27, d27, d23 ; filter &= mask
-
- vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
- vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
- vshr.s8 d28, d28, #3 ; filter2 >>= 3
- vshr.s8 d27, d27, #3 ; filter1 >>= 3
-
- vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
- vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
-
- ; outer tap adjustments
- vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
-
- veor d6, d26, d18 ; *oq0 = u^0x80
-
- vbic d27, d27, d22 ; filter &= ~hev
-
- vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
- vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
-
- veor d5, d19, d18 ; *op0 = u^0x80
- veor d4, d21, d18 ; *op1 = u^0x80
- veor d7, d20, d18 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp9_loop_filter_neon|
-
; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c
index 079d26677..6432c6cac 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -10,705 +10,49 @@
#include <arm_neon.h>
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
-
-static INLINE void vp9_loop_filter_neon(
- uint8x8_t dblimit, // flimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p3
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d4ru8, // p1
- uint8x8_t *d5ru8, // p0
- uint8x8_t *d6ru8, // q0
- uint8x8_t *d7ru8) { // q1
- uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
- int16x8_t q12s16;
- int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d3u8 = vabd_u8(d17u8, d16u8);
- d4u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
- d3u8 = vmax_u8(d3u8, d4u8);
- d23u8 = vmax_u8(d19u8, d20u8);
-
- d17u8 = vabd_u8(d6u8, d7u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
- d22u8 = vcgt_u8(d22u8, dthresh);
- d23u8 = vmax_u8(d23u8, d3u8);
-
- d28u8 = vabd_u8(d5u8, d16u8);
- d17u8 = vqadd_u8(d17u8, d17u8);
-
- d23u8 = vcge_u8(dlimit, d23u8);
-
- d18u8 = vdup_n_u8(0x80);
- d5u8 = veor_u8(d5u8, d18u8);
- d6u8 = veor_u8(d6u8, d18u8);
- d7u8 = veor_u8(d7u8, d18u8);
- d16u8 = veor_u8(d16u8, d18u8);
-
- d28u8 = vshr_n_u8(d28u8, 1);
- d17u8 = vqadd_u8(d17u8, d28u8);
-
- d19u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
- vreinterpret_s8_u8(d6u8));
-
- d17u8 = vcge_u8(dblimit, d17u8);
-
- d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
- vreinterpret_s8_u8(d16u8));
-
- d22u8 = vorr_u8(d21u8, d22u8);
-
- q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
- d23u8 = vand_u8(d23u8, d17u8);
-
- q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
- d17u8 = vdup_n_u8(4);
-
- d27s8 = vqmovn_s16(q12s16);
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
- d27s8 = vreinterpret_s8_u8(d27u8);
-
- d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
- d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
- d28s8 = vshr_n_s8(d28s8, 3);
- d27s8 = vshr_n_s8(d27s8, 3);
-
- d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
- d27s8 = vrshr_n_s8(d27s8, 1);
- d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
- d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
- d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
- *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
- *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
- *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
- return;
-}
-
-void vp9_lpf_horizontal_4_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
- if (count == 0) // end_vp9_lf_h_edge
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < count; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- vp9_loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- s -= (pitch * 5);
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- s += pitch;
- vst1_u8(s, d6u8);
- s += pitch;
- vst1_u8(s, d7u8);
- }
- return;
-}
-
-void vp9_lpf_vertical_4_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i, pitch8;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
-
- if (count == 0) // end_vp9_lf_h_edge
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- pitch8 = pitch * 8;
- for (i = 0; i < count; i++, src += pitch8) {
- s = src - (i + 1) * 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- vp9_loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- d4Result.val[0] = d4u8;
- d4Result.val[1] = d5u8;
- d4Result.val[2] = d6u8;
- d4Result.val[3] = d7u8;
-
- src -= 2;
- vst4_lane_u8(src, d4Result, 0);
- src += pitch;
- vst4_lane_u8(src, d4Result, 1);
- src += pitch;
- vst4_lane_u8(src, d4Result, 2);
- src += pitch;
- vst4_lane_u8(src, d4Result, 3);
- src += pitch;
- vst4_lane_u8(src, d4Result, 4);
- src += pitch;
- vst4_lane_u8(src, d4Result, 5);
- src += pitch;
- vst4_lane_u8(src, d4Result, 6);
- src += pitch;
- vst4_lane_u8(src, d4Result, 7);
- }
- return;
+#include "vpx/vpx_integer.h"
+
+void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
}
-static INLINE void vp9_mbloop_filter_neon(
- uint8x8_t dblimit, // mblimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p2
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d0ru8, // p1
- uint8x8_t *d1ru8, // p1
- uint8x8_t *d2ru8, // p0
- uint8x8_t *d3ru8, // q0
- uint8x8_t *d4ru8, // q1
- uint8x8_t *d5ru8) { // q1
- uint32_t flat;
- uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
- uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
- int16x8_t q15s16;
- uint16x8_t q10u16, q14u16;
- int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d23u8 = vabd_u8(d17u8, d16u8);
- d24u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
-
- d25u8 = vabd_u8(d6u8, d4u8);
-
- d23u8 = vmax_u8(d23u8, d24u8);
-
- d26u8 = vabd_u8(d7u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
-
- d24u8 = vabd_u8(d6u8, d7u8);
- d27u8 = vabd_u8(d3u8, d6u8);
- d28u8 = vabd_u8(d18u8, d7u8);
-
- d19u8 = vmax_u8(d19u8, d23u8);
-
- d23u8 = vabd_u8(d5u8, d16u8);
- d24u8 = vqadd_u8(d24u8, d24u8);
-
-
- d19u8 = vcge_u8(dlimit, d19u8);
-
-
- d25u8 = vmax_u8(d25u8, d26u8);
- d26u8 = vmax_u8(d27u8, d28u8);
-
- d23u8 = vshr_n_u8(d23u8, 1);
-
- d25u8 = vmax_u8(d25u8, d26u8);
-
- d24u8 = vqadd_u8(d24u8, d23u8);
-
- d20u8 = vmax_u8(d20u8, d25u8);
-
- d23u8 = vdup_n_u8(1);
- d24u8 = vcge_u8(dblimit, d24u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
-
- d20u8 = vcge_u8(d23u8, d20u8);
-
- d19u8 = vand_u8(d19u8, d24u8);
-
- d23u8 = vcgt_u8(d22u8, dthresh);
-
- d20u8 = vand_u8(d20u8, d19u8);
-
- d22u8 = vdup_n_u8(0x80);
-
- d23u8 = vorr_u8(d21u8, d23u8);
-
- q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
- vreinterpret_u16_u8(d21u8));
-
- d30u8 = vshrn_n_u16(q10u16, 4);
- flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
- if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
- d27u8 = vdup_n_u8(3);
- d21u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d5ru8 = vqrshrn_n_u16(q14u16, 3);
- } else {
- d21u8 = veor_u8(d7u8, d22u8);
- d24u8 = veor_u8(d6u8, d22u8);
- d25u8 = veor_u8(d5u8, d22u8);
- d26u8 = veor_u8(d16u8, d22u8);
-
- d27u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
- d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
- q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
- d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- q15s16 = vaddw_s8(q15s16, d29s8);
-
- d29u8 = vdup_n_u8(4);
-
- d28s8 = vqmovn_s16(q15s16);
-
- d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
- d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
- d30s8 = vshr_n_s8(d30s8, 3);
- d29s8 = vshr_n_s8(d29s8, 3);
-
- d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
- d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
- d29s8 = vrshr_n_s8(d29s8, 1);
- d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
- if (flat == 0) { // filter_branch_only
- *d0ru8 = d4u8;
- *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
- *d5ru8 = d17u8;
- return;
- }
-
- d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
- d23u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
- d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
- q14u16 = vaddw_u8(q14u16, d5u8);
-
- d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
- d30u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
- d31u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
-
- *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
- d23u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
-
- *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
- d22u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
- d6u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
- d7u8 = vqrshrn_n_u16(q14u16, 3);
-
- *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
- *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
- *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
- }
- return;
+void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}
-void vp9_lpf_horizontal_8_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
-
- if (count == 0) // end_vp9_mblf_h_edge
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < count; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- s -= (pitch * 6);
- vst1_u8(s, d0u8);
- s += pitch;
- vst1_u8(s, d1u8);
- s += pitch;
- vst1_u8(s, d2u8);
- s += pitch;
- vst1_u8(s, d3u8);
- s += pitch;
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- }
- return;
+void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}
-void vp9_lpf_vertical_8_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
- uint8x8x2_t d2Result;
-
- if (count == 0)
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- for (i = 0; i < count; i++) {
- s = src + (i * (pitch << 3)) - 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- d4Result.val[0] = d0u8;
- d4Result.val[1] = d1u8;
- d4Result.val[2] = d2u8;
- d4Result.val[3] = d3u8;
-
- d2Result.val[0] = d4u8;
- d2Result.val[1] = d5u8;
-
- s = src - 3;
- vst4_lane_u8(s, d4Result, 0);
- s += pitch;
- vst4_lane_u8(s, d4Result, 1);
- s += pitch;
- vst4_lane_u8(s, d4Result, 2);
- s += pitch;
- vst4_lane_u8(s, d4Result, 3);
- s += pitch;
- vst4_lane_u8(s, d4Result, 4);
- s += pitch;
- vst4_lane_u8(s, d4Result, 5);
- s += pitch;
- vst4_lane_u8(s, d4Result, 6);
- s += pitch;
- vst4_lane_u8(s, d4Result, 7);
-
- s = src + 1;
- vst2_lane_u8(s, d2Result, 0);
- s += pitch;
- vst2_lane_u8(s, d2Result, 1);
- s += pitch;
- vst2_lane_u8(s, d2Result, 2);
- s += pitch;
- vst2_lane_u8(s, d2Result, 3);
- s += pitch;
- vst2_lane_u8(s, d2Result, 4);
- s += pitch;
- vst2_lane_u8(s, d2Result, 5);
- s += pitch;
- vst2_lane_u8(s, d2Result, 6);
- s += pitch;
- vst2_lane_u8(s, d2Result, 7);
- }
- return;
+#if HAVE_NEON_ASM
+void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
+#endif // HAVE_NEON_ASM
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 1a3fefc5f..b48d52230 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1517,12 +1517,12 @@ void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
output[0] = WRAPLOW(step[0] + step[3], bd);
@@ -1562,10 +1562,11 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int dest_stride, int bd) {
int i;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1587,12 +1588,12 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2 & stage 3 - even half
vp9_highbd_idct4(step1, step1, bd);
@@ -1607,8 +1608,8 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
// stage 4
@@ -1653,9 +1654,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
@@ -1696,10 +1698,10 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd);
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd);
- output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
+ output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+ output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+ output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1764,14 +1766,14 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
// stage 2
s0 = x0;
@@ -1787,10 +1789,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1798,10 +1800,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x4, bd);
@@ -1910,23 +1912,23 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 3
step1[0] = step2[0];
@@ -1936,12 +1938,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -1955,12 +1957,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -1970,12 +1972,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -1987,8 +1989,8 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2013,12 +2015,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2115,22 +2117,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
// stage 2
s0 = x0;
@@ -2158,14 +2160,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
x5 = WRAPLOW(s1 - s5, bd);
x6 = WRAPLOW(s2 - s6, bd);
x7 = WRAPLOW(s3 - s7, bd);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
// stage 3
s0 = x0;
@@ -2189,18 +2191,18 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
x8 = WRAPLOW(s8 + s10, bd);
x9 = WRAPLOW(s9 + s11, bd);
x10 = WRAPLOW(s8 - s10, bd);
x11 = WRAPLOW(s9 - s11, bd);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
// stage 4
s2 = (- cospi_16_64) * (x2 + x3);
@@ -2212,14 +2214,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
s14 = (- cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11), bd);
- x14 = WRAPLOW(dct_const_round_shift(s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s15), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x8, bd);
@@ -2306,10 +2308,11 @@ void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
@@ -2343,43 +2346,43 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
step2[0] = step1[0];
@@ -2393,23 +2396,23 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[16] = WRAPLOW(step1[16] + step1[17], bd);
step2[17] = WRAPLOW(step1[16] - step1[17], bd);
@@ -2436,12 +2439,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -2456,22 +2459,22 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2480,12 +2483,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2495,12 +2498,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2530,8 +2533,8 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2547,20 +2550,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2581,12 +2584,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2632,20 +2635,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
@@ -2759,8 +2762,9 @@ void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 1d8836cf3..6e2551dd4 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -80,13 +80,7 @@ static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_VP9_HIGHBITDEPTH
- // For valid highbitdepth VP9 streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
-#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -95,7 +89,7 @@ static INLINE tran_low_t check_range(tran_high_t input) {
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
@@ -104,6 +98,32 @@ static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
return check_range(rv);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+                                            int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // Assert-based bound check on an intermediate coefficient. The limits
+  // below describe a signed (bd + 8)-bit integer, i.e. 16 bits for
+  // bd == 8, 18 bits for bd == 10 and 20 bits for bd == 12.
+  const int32_t upper = (1 << (7 + bd)) - 1;
+  const int32_t lower = -upper - 1;
+  assert(input >= lower);
+  assert(input <= upper);
+  // Presumably silences an unused-variable warning when assert() is a no-op.
+  (void) lower;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  // bd is only read inside the range-checking section above.
+  (void) bd;
+  return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+                                                      int bd) {
+  // Round away DCT_CONST_BITS low bits, then hand the result to the
+  // bit-depth-aware range check, which also narrows it to tran_low_t.
+  return highbd_check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 110404f5a..e7ee903c6 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -136,13 +136,27 @@ static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
}
}
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+  // Writes the SAD and variance-difference thresholds for block size bs,
+  // both offset by qdiff.
+  // Base SAD threshold per block size: 16x16 -> 7, 32x32 -> 6, otherwise
+  // (BLOCK_64X64) -> 5.
+  const int sad_base = (bs == BLOCK_16X16) ? 7 : (bs == BLOCK_32X32) ? 6 : 5;
+  // The SAD threshold takes qdiff shifted right by MFQE_PRECISION bits.
+  *sad_thr = sad_base + (qdiff >> MFQE_PRECISION);
+  // The variance-difference threshold takes qdiff unscaled.
+  *vdiff_thr = 125 + qdiff;
+}
+
static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
const uint8_t *v, int y_stride, int uv_stride,
- uint8_t *yd, uint8_t *ud, uint8_t *vd,
- int yd_stride, int uvd_stride) {
- int sad, sad_thr, vdiff;
+ uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+ int uvd_stride, int qdiff) {
+ int sad, sad_thr, vdiff, vdiff_thr;
uint32_t sse;
+ get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
if (bs == BLOCK_16X16) {
vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -154,23 +168,18 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}
- if (bs == BLOCK_16X16) {
- sad_thr = 8;
- } else if (bs == BLOCK_32X32) {
- sad_thr = 7;
- } else { // BLOCK_64X64
- sad_thr = 6;
- }
-
- // TODO(jackychen): More experiments and remove magic numbers.
// vdiff > sad * 3 means vdiff should not be too small, otherwise,
// it might be a lighting change in smooth area. When there is a
// lighting change in smooth area, it is dangerous to do MFQE.
- if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
- int weight = ((float)sad / (sad_thr - 1)) * ((float)vdiff / (150 - 1)) *
- (1 << MFQE_PRECISION);
- apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
- ud, vd, uvd_stride, bs, weight);
+ if (sad > 1 && vdiff > sad * 3) {
+ const int weight = 1 << MFQE_PRECISION;
+ int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+ // When ifactor equals weight, no MFQE is done.
+ if (ifactor > weight) {
+ ifactor = weight;
+ }
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+ uvd_stride, bs, ifactor);
} else {
// Copy the block from current frame (i.e., no mfqe is done).
copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
@@ -199,8 +208,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
int yd_stride, int uvd_stride) {
int mi_offset, y_offset, uv_offset;
const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
- // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
- // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+ const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
const int bsl = b_width_log2_lookup[bs];
PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
const BLOCK_SIZE subsize = get_subsize(bs, partition);
@@ -235,18 +243,18 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
if (mfqe_decision(mi, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
- yd, ud, vd, yd_stride, uvd_stride);
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
y_stride, uv_stride, yd + y_offset, ud + uv_offset,
- vd + uv_offset, yd_stride, uvd_stride);
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
}
if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
- vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
@@ -254,7 +262,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
- yd_stride, uvd_stride);
+ yd_stride, uvd_stride, qdiff);
}
break;
case PARTITION_VERT:
@@ -268,18 +276,18 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
if (mfqe_decision(mi, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
- yd, ud, vd, yd_stride, uvd_stride);
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
- vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
}
if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
y_stride, uv_stride, yd + y_offset, ud + uv_offset,
- vd + uv_offset, yd_stride, uvd_stride);
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
@@ -287,14 +295,14 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
- yd_stride, uvd_stride);
+ yd_stride, uvd_stride, qdiff);
}
break;
case PARTITION_NONE:
if (mfqe_decision(mi, cur_bs)) {
// Do mfqe on this partition.
mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
- yd, ud, vd, yd_stride, uvd_stride);
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
} else {
// Copy the block from current frame(i.e., no mfqe is done).
copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index 8c4a30353..7a20e0a9e 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -36,24 +36,24 @@ void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
vp9_tile_set_col(tile, cm, col);
}
-void vp9_get_tile_n_bits(int mi_cols,
- int *min_log2_tile_cols, int *max_log2_tile_cols) {
- const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
- int min_log2 = 0, max_log2 = 0;
-
- // max
- while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
- ++max_log2;
- --max_log2;
- if (max_log2 < 0)
- max_log2 = 0;
-
- // min
- while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
+static int get_min_log2_tile_cols(const int sb64_cols) {
+ int min_log2 = 0;
+ while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
++min_log2;
+ return min_log2;
+}
- assert(min_log2 <= max_log2);
+static int get_max_log2_tile_cols(const int sb64_cols) {
+ int max_log2 = 1;
+ while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+ ++max_log2;
+ return max_log2 - 1;
+}
- *min_log2_tile_cols = min_log2;
- *max_log2_tile_cols = max_log2;
+void vp9_get_tile_n_bits(int mi_cols,
+ int *min_log2_tile_cols, int *max_log2_tile_cols) {
+ const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+ *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+ assert(*min_log2_tile_cols <= *max_log2_tile_cols);
}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9677173db..3b8af84fa 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1546,8 +1546,6 @@ void vp9_decode_frame(VP9Decoder *pbi,
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- init_macroblockd(cm, &pbi->mb);
-
cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
cm->width == cm->last_width &&
cm->height == cm->last_height &&
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 1406b4034..80654b19c 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -238,7 +238,7 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
- cm->frame_refs[ref_index].idx = INT_MAX;
+ cm->frame_refs[ref_index].idx = -1;
}
int vp9_receive_compressed_data(VP9Decoder *pbi,
@@ -258,7 +258,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX)
+ if (cm->frame_refs[0].idx > 0)
cm->frame_refs[0].buf->corrupted = 1;
}
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 8704fddac..23d622d70 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -14,6 +14,9 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
#include "vp9/decoder/vp9_detokenize.h"
@@ -32,7 +35,7 @@
#define INCREMENT_COUNT(token) \
do { \
if (!cm->frame_parallel_decoding_mode) \
- ++coef_counts[band][ctx][token]; \
+ ++coef_counts[band][ctx][token]; \
} while (0)
static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
@@ -191,10 +194,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
}
v = (val * dqv) >> dq_shift;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+ cm->bit_depth);
+#else
dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#else
dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
ctx = get_coef_context(nb, token_cache, c);
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 752429c8f..19bcfd2b6 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -297,7 +297,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
write_inter_mode(w, mode, inter_probs);
- ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
}
}
@@ -320,7 +319,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
const int j = idy * 2 + idx;
const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
write_inter_mode(w, b_mode, inter_probs);
- ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
if (b_mode == NEWMV) {
for (ref = 0; ref < 1 + is_compound; ++ref)
vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -1172,8 +1170,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
counts->inter_mode[i], INTER_MODES, &header_bc);
- vp9_zero(counts->inter_mode);
-
if (cm->interp_filter == SWITCHABLE)
update_switchable_interp_probs(cm, &header_bc, counts);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 506f6de84..41f72f89b 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -170,7 +170,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
vp9_fdct4x4_c(input, output, stride);
} else {
tran_low_t out[4 * 4];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
@@ -183,7 +182,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- outptr[j * 4 + i] = temp_out[j];
+ out[j * 4 + i] = temp_out[j];
}
// Rows
@@ -711,7 +710,6 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
vp9_fdct8x8_c(input, output, stride);
} else {
tran_low_t out[64];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
@@ -722,7 +720,7 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- outptr[j * 8 + i] = temp_out[j];
+ out[j * 8 + i] = temp_out[j];
}
// Rows
@@ -1103,7 +1101,6 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
vp9_fdct16x16_c(input, output, stride);
} else {
tran_low_t out[256];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
@@ -1114,7 +1111,7 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 784319f08..9e037e1dd 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -195,15 +195,6 @@ static uint8_t *block_start(uint8_t *framebuf, int stride,
return framebuf + (stride * mi_row * 8) + (mi_col * 8);
}
-static void copy_block(uint8_t *dest, int dest_stride,
- const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
- int r;
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
- vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
- dest += dest_stride;
- src += src_stride;
- }
-}
static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
MACROBLOCK *mb,
@@ -348,9 +339,15 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
}
if (decision == FILTER_BLOCK) {
- copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+ vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+ vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
}
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 6784d0164..756052771 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -55,9 +55,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData * td,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
-// Motion vector component magnitude threshold for defining fast motion.
-#define FAST_MOTION_MV_THRESH 24
-
// This is used as a reference when computing the source variance for the
// purposes of activity masking.
// Eventually this should be replaced by custom no-reference routines,
@@ -1010,22 +1007,20 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
const MACROBLOCKD *const xd = &x->e_mbd;
const MODE_INFO *const mi = xd->mi[0].src_mi;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
if (!frame_is_intra_only(cm)) {
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
if (!seg_ref_active) {
- FRAME_COUNTS *const counts = td->counts;
- const int inter_block = is_inter_block(mbmi);
-
counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
-
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if (inter_block) {
const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-
if (cm->reference_mode == REFERENCE_MODE_SELECT)
counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
[has_second_ref(mbmi)]++;
@@ -1042,6 +1037,25 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
}
}
}
+ if (inter_block &&
+ !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+ if (bsize >= BLOCK_8X8) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+ }
+ }
+ }
+ }
}
}
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index d016e9742..70b804e31 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -93,7 +93,7 @@ typedef struct vp9_token_state {
int rate;
int error;
int next;
- signed char token;
+ int16_t token;
short qc;
} vp9_token_state;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 628660b5f..230df1ffa 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1780,9 +1780,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_free(&(cpi->denoiser));
- }
+ vp9_denoiser_free(&(cpi->denoiser));
#endif
for (t = 0; t < cpi->num_workers; ++t) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index d9659d14c..5acfcc51d 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -976,7 +976,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
ctx->pred_pixel_ready = 0;
- for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) {
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
int_mv dummy_mv[2];
x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -1169,7 +1169,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
} // loop through sub8x8 blocks
if (this_rd < best_rd) {
- this_rd = best_rd;
+ best_rd = this_rd;
best_ref_frame = ref_frame;
}
} // reference frames
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 5154a59ea..81cc2e13f 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -65,7 +65,7 @@ extern const int16_t vp9_cat6_low_cost[256];
extern const int16_t vp9_cat6_high_cost[128];
extern const int16_t vp9_cat6_high10_high_cost[512];
extern const int16_t vp9_cat6_high12_high_cost[2048];
-static INLINE int16_t vp9_get_cost(uint8_t token, EXTRABIT extrabits,
+static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
const int16_t *cat6_high_table) {
if (token != CATEGORY6_TOKEN)
return vp9_extra_bits[token].cost[extrabits];
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index f5e6e3190..a7ba074ba 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -133,11 +133,13 @@ ifeq ($(ARCH_X86_64), yes)
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
@@ -156,9 +158,8 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
@@ -176,8 +177,9 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 43bf35f9c..b9177876e 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -148,7 +148,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
if (frame_marker != VP9_FRAME_MARKER)
return VPX_CODEC_UNSUP_BITSTREAM;
- if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM;
+ if (profile >= MAX_PROFILES)
+ return VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+ return VPX_CODEC_UNSUP_BITSTREAM;
if (vp9_rb_read_bit(&rb)) { // show an existing frame
vp9_rb_read_literal(&rb, 3); // Frame buffer to show.