Diffstat:
-rw-r--r--  build/make/configure.sh | 9
-rw-r--r--  test/convolve_test.cc | 176
-rw-r--r--  test/decode_perf_test.cc | 163
-rw-r--r--  test/fdct8x8_test.cc | 58
-rw-r--r--  test/lpf_8_test.cc | 31
-rw-r--r--  test/partial_idct_test.cc | 4
-rw-r--r--  test/test-data.mk | 4
-rw-r--r--  test/test.mk | 3
-rw-r--r--  third_party/x86inc/x86inc.asm | 12
-rw-r--r--  vp9/common/arm/neon/vp9_avg_neon.c | 145
-rw-r--r--  vp9/common/arm/neon/vp9_avg_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_avg_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_avg_neon.c | 387
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_convolve8_avg_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_neon.c | 354
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_convolve8_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_copy_neon.c | 92
-rw-r--r--  vp9/common/arm/neon/vp9_copy_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_copy_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c | 59
-rw-r--r--  vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct16x16_add_neon.c | 1330
-rw-r--r--  vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct16x16_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct16x16_neon.c | 14
-rw-r--r--  vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c | 161
-rw-r--r--  vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct32x32_add_neon.c | 748
-rw-r--r--  vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct32x32_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c | 48
-rw-r--r--  vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct4x4_add_neon.c | 151
-rw-r--r--  vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct4x4_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c | 62
-rw-r--r--  vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_idct8x8_add_neon.c | 545
-rw-r--r--  vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_idct8x8_add_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 6
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_loopfilter_16_neon.asm) | 0
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_neon.c | 712
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_loopfilter_neon.asm) | 0
-rw-r--r--  vp9/common/vp9_alloccommon.c | 1
-rw-r--r--  vp9/common/vp9_blockd.h | 4
-rw-r--r--  vp9/common/vp9_mfqe.c | 314
-rw-r--r--  vp9/common/vp9_mfqe.h | 31
-rw-r--r--  vp9/common/vp9_onyxc_int.h | 16
-rw-r--r--  vp9/common/vp9_postproc.c | 76
-rw-r--r--  vp9/common/vp9_postproc.h | 8
-rw-r--r--  vp9/common/vp9_ppflags.h | 3
-rw-r--r--  vp9/common/vp9_pred_common.c | 62
-rw-r--r--  vp9/common/vp9_pred_common.h | 16
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl | 76
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c | 2
-rw-r--r--  vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 52
-rw-r--r--  vp9/decoder/vp9_decodeframe.c | 18
-rw-r--r--  vp9/decoder/vp9_decodemv.c | 3
-rw-r--r--  vp9/decoder/vp9_decoder.c | 10
-rw-r--r--  vp9/encoder/vp9_aq_complexity.c | 112
-rw-r--r--  vp9/encoder/vp9_aq_complexity.h | 9
-rw-r--r--  vp9/encoder/vp9_bitstream.c | 5
-rw-r--r--  vp9/encoder/vp9_encodeframe.c | 249
-rw-r--r--  vp9/encoder/vp9_encoder.c | 7
-rw-r--r--  vp9/encoder/vp9_encoder.h | 2
-rw-r--r--  vp9/encoder/vp9_pickmode.c | 2
-rw-r--r--  vp9/encoder/vp9_ratectrl.c | 4
-rw-r--r--  vp9/encoder/vp9_rdopt.c | 8
-rw-r--r--  vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 8
-rw-r--r--  vp9/encoder/x86/vp9_subpel_variance.asm | 4
-rw-r--r--  vp9/vp9_common.mk | 60
66 files changed, 6010 insertions(+), 426 deletions(-)
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 9327ce95e..7be583d72 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -805,7 +805,12 @@ process_common_toolchain() {
;;
armv7|armv7s)
soft_enable neon
- soft_enable neon_asm
+ # Only enable neon_asm when neon is also enabled.
+ enabled neon && soft_enable neon_asm
+ # If someone tries to force it through, die.
+ if disabled neon && enabled neon_asm; then
+ die "Disabling neon while keeping neon-asm is not supported"
+ fi
soft_enable media
soft_enable fast_unaligned
;;
@@ -1118,7 +1123,7 @@ EOF
bits=32
enabled x86_64 && bits=64
check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
+#if !defined(__ILP32__) || !defined(__x86_64__)
#error "not x32"
#endif
EOF
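
The second configure.sh hunk tightens x32 detection: the check_cpp heredoc now errors out unless the compiler defines both __ILP32__ and __x86_64__, so a plain 32-bit target no longer misreports as x32. A minimal C sketch of the same preprocessor logic (the printed strings are purely illustrative):

    #include <stdio.h>

    int main(void) {
    #if defined(__ILP32__) && defined(__x86_64__)
      printf("x32 ABI: x86_64 instructions with 32-bit pointers\n");
    #elif defined(__x86_64__)
      printf("x86_64: 64-bit pointers\n");
    #else
      printf("32-bit (or non-x86) target\n");
    #endif
      return 0;
    }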
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index b0f60529a..e30ccf92e 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -10,12 +10,14 @@
#include <string.h>
#include "test/acm_random.h"
+#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -31,13 +33,16 @@ typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
int w, int h);
struct ConvolveFunctions {
- ConvolveFunctions(ConvolveFunc h8, ConvolveFunc h8_avg,
+ ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg,
+ ConvolveFunc h8, ConvolveFunc h8_avg,
ConvolveFunc v8, ConvolveFunc v8_avg,
ConvolveFunc hv8, ConvolveFunc hv8_avg,
int bd)
- : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),
- hv8_avg_(hv8_avg), use_highbd_(bd) {}
+ : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
+ v8_avg_(v8_avg), hv8_avg_(hv8_avg), use_highbd_(bd) {}
+ ConvolveFunc copy_;
+ ConvolveFunc avg_;
ConvolveFunc h8_;
ConvolveFunc v8_;
ConvolveFunc hv8_;
@@ -298,25 +303,35 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
output_ = reinterpret_cast<uint8_t*>(
vpx_memalign(kDataAlignment, kOutputBufferSize));
+ output_ref_ = reinterpret_cast<uint8_t*>(
+ vpx_memalign(kDataAlignment, kOutputBufferSize));
#if CONFIG_VP9_HIGHBITDEPTH
input16_ = reinterpret_cast<uint16_t*>(
vpx_memalign(kDataAlignment,
(kInputBufferSize + 1) * sizeof(uint16_t))) + 1;
output16_ = reinterpret_cast<uint16_t*>(
vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+ output16_ref_ = reinterpret_cast<uint16_t*>(
+ vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
#endif
}
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
static void TearDownTestCase() {
vpx_free(input_ - 1);
input_ = NULL;
vpx_free(output_);
output_ = NULL;
+ vpx_free(output_ref_);
+ output_ref_ = NULL;
#if CONFIG_VP9_HIGHBITDEPTH
vpx_free(input16_ - 1);
input16_ = NULL;
vpx_free(output16_);
output16_ = NULL;
+ vpx_free(output16_ref_);
+ output16_ref_ = NULL;
#endif
}
@@ -382,6 +397,13 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
#endif
}
+ void CopyOutputToRef() {
+ vpx_memcpy(output_ref_, output_, kOutputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_memcpy(output16_ref_, output16_, kOutputBufferSize);
+#endif
+ }
+
void CheckGuardBlocks() {
for (int i = 0; i < kOutputBufferSize; ++i) {
if (IsIndexInBorder(i))
@@ -415,6 +437,19 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
#endif
}
+ uint8_t *output_ref() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (UUT_->use_highbd_ == 0) {
+ return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ } else {
+ return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
+ BorderLeft());
+ }
+#else
+ return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
+ }
+
uint16_t lookup(uint8_t *list, int index) const {
#if CONFIG_VP9_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
@@ -493,24 +528,65 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
const ConvolveFunctions* UUT_;
static uint8_t* input_;
static uint8_t* output_;
+ static uint8_t* output_ref_;
#if CONFIG_VP9_HIGHBITDEPTH
static uint16_t* input16_;
static uint16_t* output16_;
+ static uint16_t* output16_ref_;
int mask_;
#endif
};
uint8_t* ConvolveTest::input_ = NULL;
uint8_t* ConvolveTest::output_ = NULL;
+uint8_t* ConvolveTest::output_ref_ = NULL;
#if CONFIG_VP9_HIGHBITDEPTH
uint16_t* ConvolveTest::input16_ = NULL;
uint16_t* ConvolveTest::output16_ = NULL;
+uint16_t* ConvolveTest::output16_ref_ = NULL;
#endif
TEST_P(ConvolveTest, GuardBlocks) {
CheckGuardBlocks();
}
+TEST_P(ConvolveTest, Copy) {
+ uint8_t* const in = input();
+ uint8_t* const out = output();
+
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+ Width(), Height()));
+
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y)
+ for (int x = 0; x < Width(); ++x)
+ ASSERT_EQ(lookup(out, y * kOutputStride + x),
+ lookup(in, y * kInputStride + x))
+ << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, Avg) {
+ uint8_t* const in = input();
+ uint8_t* const out = output();
+ uint8_t* const out_ref = output_ref();
+ CopyOutputToRef();
+
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->avg_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+ Width(), Height()));
+
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y)
+ for (int x = 0; x < Width(); ++x)
+ ASSERT_EQ(lookup(out, y * kOutputStride + x),
+ ROUND_POWER_OF_TWO(lookup(in, y * kInputStride + x) +
+ lookup(out_ref, y * kOutputStride + x), 1))
+ << "(" << x << "," << y << ")";
+}
+
TEST_P(ConvolveTest, CopyHoriz) {
uint8_t* const in = input();
uint8_t* const out = output();
@@ -1188,6 +1264,30 @@ void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
}
#endif // HAVE_SSE2 && ARCH_X86_64
+void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w, int h) {
+ vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ filter_x, filter_x_stride,
+ filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w, int h) {
+ vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, filter_x_stride,
+ filter_y, filter_y_stride, w, h, 8);
+}
+
void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
@@ -1260,6 +1360,30 @@ void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
filter_y, filter_y_stride, w, h, 8);
}
+void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w, int h) {
+ vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ filter_x, filter_x_stride,
+ filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w, int h) {
+ vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, filter_x_stride,
+ filter_y, filter_y_stride, w, h, 10);
+}
+
void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
@@ -1332,6 +1456,30 @@ void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
filter_y, filter_y_stride, w, h, 10);
}
+void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w, int h) {
+ vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ filter_x, filter_x_stride,
+ filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w, int h) {
+ vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, filter_x_stride,
+ filter_y, filter_y_stride, w, h, 12);
+}
+
void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
@@ -1405,6 +1553,7 @@ void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
}
const ConvolveFunctions convolve8_c(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
@@ -1423,6 +1572,7 @@ INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
make_tuple(32, 64, &convolve8_c),
make_tuple(64, 64, &convolve8_c)));
const ConvolveFunctions convolve10_c(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
@@ -1441,6 +1591,7 @@ INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
make_tuple(32, 64, &convolve10_c),
make_tuple(64, 64, &convolve10_c)));
const ConvolveFunctions convolve12_c(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
@@ -1462,6 +1613,7 @@ INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
#else
const ConvolveFunctions convolve8_c(
+ vp9_convolve_copy_c, vp9_convolve_avg_c,
vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
vp9_convolve8_c, vp9_convolve8_avg_c, 0);
@@ -1485,14 +1637,17 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
#if HAVE_SSE2 && ARCH_X86_64
#if CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_sse2(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
const ConvolveFunctions convolve10_sse2(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
const ConvolveFunctions convolve12_sse2(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
@@ -1538,6 +1693,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
make_tuple(64, 64, &convolve12_sse2)));
#else
const ConvolveFunctions convolve8_sse2(
+ vp9_convolve_copy_sse2, vp9_convolve_avg_sse2,
vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0);
@@ -1561,6 +1717,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
+ vp9_convolve_copy_c, vp9_convolve_avg_c,
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0);
@@ -1583,6 +1740,7 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
#if HAVE_AVX2 && HAVE_SSSE3
const ConvolveFunctions convolve8_avx2(
+ vp9_convolve_copy_c, vp9_convolve_avg_c,
vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0);
@@ -1603,11 +1761,20 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
make_tuple(64, 64, &convolve8_avx2)));
#endif // HAVE_AVX2 && HAVE_SSSE3
+#if HAVE_NEON
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
+ vp9_convolve_copy_neon, vp9_convolve_avg_neon,
+ vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
+ vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
+ vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+#else // HAVE_NEON
+const ConvolveFunctions convolve8_neon(
+ vp9_convolve_copy_neon, vp9_convolve_avg_neon,
vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+#endif // HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_neon),
@@ -1623,10 +1790,11 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
make_tuple(64, 32, &convolve8_neon),
make_tuple(32, 64, &convolve8_neon),
make_tuple(64, 64, &convolve8_neon)));
-#endif
+#endif // HAVE_NEON
#if HAVE_DSPR2
const ConvolveFunctions convolve8_dspr2(
+ vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2,
vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2,
vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2,
vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0);
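
The new Copy and Avg tests exercise the copy/avg predictors that ConvolveFunctions now carries. Avg checks every output pixel against ROUND_POWER_OF_TWO(src + dst_ref, 1), i.e. a rounded average of the source and the previous destination contents. A minimal sketch of that reference value (the macro body below is spelled out for illustration; the test pulls the real definition from vp9/common/vp9_common.h):

    #include <stdint.h>

    /* Rounded right shift: add half of the divisor before shifting. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    /* Expected Avg result for one pixel pair: (src + dst + 1) >> 1. */
    static uint8_t avg_pixel(uint8_t src, uint8_t dst) {
      return (uint8_t)ROUND_POWER_OF_TWO(src + dst, 1);
    }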
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 5a7114022..0a0713ab5 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -8,13 +8,17 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <string>
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
#include "test/ivf_video_source.h"
#include "test/md5_helper.h"
#include "test/util.h"
#include "test/webm_video_source.h"
#include "vpx_ports/vpx_timer.h"
+#include "./ivfenc.h"
#include "./vpx_version.h"
using std::tr1::make_tuple;
@@ -24,7 +28,9 @@ namespace {
#define VIDEO_NAME 0
#define THREADS 1
+const int kMaxPsnr = 100;
const double kUsecsInSec = 1000000.0;
+static const char *kNewEncodeOutputFile = "new_encode.ivf";
/*
DecodePerfTest takes a tuple of filename + number of threads to decode with
@@ -105,4 +111,161 @@ TEST_P(DecodePerfTest, PerfTest) {
INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest,
::testing::ValuesIn(kVP9DecodePerfVectors));
+class VP9NewEncodeDecodePerfTest : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+ VP9NewEncodeDecodePerfTest()
+ : EncoderTest(GET_PARAM(0)),
+ encoding_mode_(GET_PARAM(1)),
+ speed_(0),
+ outfile_(0),
+ out_frames_(0) {
+ }
+
+ virtual ~VP9NewEncodeDecodePerfTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_resize_allowed = 0;
+ cfg_.rc_end_usage = VPX_VBR;
+ }
+
+ virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(VP8E_SET_CPUUSED, speed_);
+ encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH");
+ const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
+ outfile_ = fopen(path_to_source.c_str(), "wb");
+ }
+
+ virtual void EndPassHook() {
+ if (outfile_) {
+ if (!fseek(outfile_, 0, SEEK_SET))
+ ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
+ fclose(outfile_);
+ outfile_ = NULL;
+ }
+ }
+
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+ ++out_frames_;
+
+ // Write initial file header if first frame.
+ if (pkt->data.frame.pts == 0)
+ ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
+
+ // Write frame header and data.
+ ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
+ (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+ }
+
+ virtual bool DoDecode() { return 0; }
+
+ void set_speed(unsigned int speed) {
+ speed_ = speed;
+ }
+
+ private:
+ libvpx_test::TestMode encoding_mode_;
+ uint32_t speed_;
+ FILE *outfile_;
+ uint32_t out_frames_;
+};
+
+
+struct EncodePerfTestVideo {
+ EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+ uint32_t bitrate_, int frames_)
+ : name(name_),
+ width(width_),
+ height(height_),
+ bitrate(bitrate_),
+ frames(frames_) {}
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+ EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
+ SetUp();
+
+ // TODO(JBB): Make this work by going through the set of given files.
+ const int i = 0;
+ const vpx_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate;
+
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ const char *video_name = kVP9EncodePerfTestVectors[i].name;
+ libvpx_test::I420VideoSource video(
+ video_name,
+ kVP9EncodePerfTestVectors[i].width,
+ kVP9EncodePerfTestVectors[i].height,
+ timebase.den, timebase.num, 0,
+ kVP9EncodePerfTestVectors[i].frames);
+ set_speed(2);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const uint32_t threads = 4;
+
+ libvpx_test::IVFVideoSource decode_video(kNewEncodeOutputFile);
+ decode_video.Init();
+
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+ cfg.threads = threads;
+ libvpx_test::VP9Decoder decoder(cfg, 0);
+
+ vpx_usec_timer t;
+ vpx_usec_timer_start(&t);
+
+ for (decode_video.Begin(); decode_video.cxdata() != NULL;
+ decode_video.Next()) {
+ decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
+ }
+
+ vpx_usec_timer_mark(&t);
+ const double elapsed_secs =
+ double(vpx_usec_timer_elapsed(&t)) / kUsecsInSec;
+ const unsigned decode_frames = decode_video.frame_number();
+ const double fps = double(decode_frames) / elapsed_secs;
+
+ printf("{\n");
+ printf("\t\"type\" : \"decode_perf_test\",\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile);
+ printf("\t\"threadCount\" : %u,\n", threads);
+ printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", decode_frames);
+ printf("\t\"framesPerSecond\" : %f\n", fps);
+ printf("}\n");
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+ VP9NewEncodeDecodePerfTest, ::testing::Values(::libvpx_test::kTwoPassGood));
} // namespace
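
VP9NewEncodeDecodePerfTest writes its encoded stream to an IVF file, and the IVF file header carries the total frame count, which is only known once encoding finishes. The hooks above therefore emit a provisional header with the first packet and rewrite it from EndPassHook after seeking back to the start of the file. A minimal sketch of that pattern, using a hypothetical fixed-size write_header() in place of libvpx's ivf_write_file_header():

    #include <stdint.h>
    #include <stdio.h>

    static void write_header(FILE *f, uint32_t nframes) {
      /* stand-in for ivf_write_file_header(); the real IVF header is a
         fixed-size block with the frame count at a fixed offset */
      fwrite(&nframes, sizeof(nframes), 1, f);
    }

    static void finish_file(FILE *f, uint32_t total_frames) {
      /* frames have already been appended; patch the header in place */
      if (!fseek(f, 0, SEEK_SET))
        write_header(f, total_frames);
      fclose(f);
    }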
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 110c9c3c9..01abca5fa 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -62,6 +62,10 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
using libvpx_test::ACMRandom;
namespace {
+
+const int kSignBiasMaxDiff255 = 1500;
+const int kSignBiasMaxDiff15 = 10000;
+
typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
@@ -160,7 +164,7 @@ class FwdTrans8x8TestBase {
for (int j = 0; j < 64; ++j) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
- const int max_diff = 1125;
+ const int max_diff = kSignBiasMaxDiff255;
EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
<< "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
@@ -190,9 +194,9 @@ class FwdTrans8x8TestBase {
for (int j = 0; j < 64; ++j) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
- const int max_diff = 10000;
+ const int max_diff = kSignBiasMaxDiff15;
EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
- << "Error: 4x4 FDCT/FHT has a sign bias > "
+ << "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-15, 15] at index " << j
<< " count0: " << count_sign_block[j][0]
@@ -646,36 +650,24 @@ TEST_P(InvTrans8x8DCT, CompareReference) {
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8DCT,
::testing::Values(
+ make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12)));
#else
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8DCT,
+ C, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
@@ -691,12 +683,9 @@ INSTANTIATE_TEST_CASE_P(
// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
@@ -706,12 +695,12 @@ INSTANTIATE_TEST_CASE_P(
// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_NEON, FwdTrans8x8DCT,
+ NEON, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,
VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
- DISABLED_NEON, FwdTrans8x8HT,
+ NEON, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
@@ -723,32 +712,24 @@ INSTANTIATE_TEST_CASE_P(
// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8DCT,
+ SSE2, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,
VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
+ make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_sse2,
@@ -761,12 +742,9 @@ INSTANTIATE_TEST_CASE_P(
// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
@@ -791,7 +769,7 @@ INSTANTIATE_TEST_CASE_P(
// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSSE3, FwdTrans8x8DCT,
+ SSSE3, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
VPX_BITS_8)));
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 0f335e2e4..e1be80baa 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -594,4 +594,35 @@ INSTANTIATE_TEST_CASE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
+#if HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+ NEON, Loop8Test6Param,
+ ::testing::Values(
+#if HAVE_NEON_ASM
+ make_tuple(&vp9_lpf_horizontal_16_neon,
+ &vp9_lpf_horizontal_16_c, 8),
+#endif // HAVE_NEON_ASM
+ make_tuple(&vp9_lpf_horizontal_4_neon,
+ &vp9_lpf_horizontal_4_c, 8),
+ make_tuple(&vp9_lpf_horizontal_8_neon,
+ &vp9_lpf_horizontal_8_c, 8),
+ make_tuple(&vp9_lpf_vertical_4_neon,
+ &vp9_lpf_vertical_4_c, 8),
+ make_tuple(&vp9_lpf_vertical_8_neon,
+ &vp9_lpf_vertical_8_c, 8)));
+INSTANTIATE_TEST_CASE_P(
+ NEON, Loop8Test9Param,
+ ::testing::Values(
+#if HAVE_NEON_ASM
+ make_tuple(&vp9_lpf_horizontal_4_dual_neon,
+ &vp9_lpf_horizontal_4_dual_c, 8),
+#endif // HAVE_NEON_ASM
+ make_tuple(&vp9_lpf_horizontal_8_dual_neon,
+ &vp9_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&vp9_lpf_vertical_4_dual_neon,
+ &vp9_lpf_vertical_4_dual_c, 8),
+ make_tuple(&vp9_lpf_vertical_8_dual_neon,
+ &vp9_lpf_vertical_8_dual_c, 8)));
+#endif // HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+
} // namespace
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 536273e3e..ba82da4e1 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -230,7 +230,7 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct4x4_1_add_c,
TX_4X4, 1)));
-#if HAVE_NEON_ASM
+#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
::testing::Values(
@@ -258,7 +258,7 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_neon,
TX_4X4, 1)));
-#endif
+#endif // HAVE_NEON
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
diff --git a/test/test-data.mk b/test/test-data.mk
index 157d1bc34..d07ca3295 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -714,6 +714,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.iv
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
+# NewEncode Test
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += niklas_1280_720_30.yuv
# BBB VP9 streams
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_426x240_tile_1x1_180kbps.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_640x360_tile_1x2_337kbps.webm
@@ -743,7 +745,9 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
+ifneq ($(CONFIG_DECODE_PERF_TESTS),yes)
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
+endif
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
diff --git a/test/test.mk b/test/test.mk
index 4b12a7693..d94329924 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -46,6 +46,9 @@ LIBVPX_TEST_SRCS-yes += decode_test_driver.h
LIBVPX_TEST_SRCS-yes += encode_test_driver.cc
LIBVPX_TEST_SRCS-yes += encode_test_driver.h
+## IVF writing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../ivfenc.c ../ivfenc.h
+
## Y4m parsing.
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_test.cc ../y4menc.c ../y4menc.h
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index 99453a998..8e75a4b19 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -617,9 +617,17 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
- global %1:private_extern
+ %ifdef __NASM_VER__
+ global %1
+ %else
+ global %1:private_extern
+ %endif
%elifidn __OUTPUT_FORMAT__,macho64
- global %1:private_extern
+ %ifdef __NASM_VER__
+ global %1
+ %else
+ global %1:private_extern
+ %endif
%else
global %1
%endif
diff --git a/vp9/common/arm/neon/vp9_avg_neon.c b/vp9/common/arm/neon/vp9_avg_neon.c
new file mode 100644
index 000000000..3a3db353e
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_avg_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+ vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
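
The vrhaddq_u8/vrhadd_u8 calls above are NEON's rounding halving add, which computes (a + b + 1) >> 1 per byte lane — the same value the convolve Avg test expects. A scalar reference sketch of what vp9_convolve_avg computes for a w-by-h block (an outline for readability, not the optimized routine):

    #include <stddef.h>
    #include <stdint.h>

    /* Rounded average of the source block into the destination block. */
    static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x)
          dst[x] = (uint8_t)((src[x] + dst[x] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
      }
    }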
diff --git a/vp9/common/arm/neon/vp9_avg_neon.asm b/vp9/common/arm/neon/vp9_avg_neon_asm.asm
index 7d2453021..7d2453021 100644
--- a/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_avg_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
new file mode 100644
index 000000000..2f8dda07c
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static inline int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vp9_convolve8_avg_horiz_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ uint8_t *s, *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w;
+ width > 0;
+ width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void vp9_convolve8_avg_vert_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ uint8_t *s, *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
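
In both functions above, MULTIPLY_BY_Q0 accumulates the eight taps of the interpolation filter for one output phase, vqrshrun_n_s32(..., 7) applies the rounded shift by FILTER_BITS with saturation, and the final vrhaddq_u8 averages the filtered result with the existing destination. A scalar sketch of the per-pixel arithmetic these intrinsics vectorize, assuming an 8-tap filter and FILTER_BITS == 7 as used by VP9's convolve code:

    #include <stdint.h>

    #define FILTER_BITS 7

    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* One 8-tap filtered pixel, averaged with the previous destination value. */
    static uint8_t filter8_avg(const uint8_t *src, const int16_t *filter,
                               uint8_t dst_old) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[k] * filter[k];
      {
        /* rounded shift + clamp: the vqrshrun_n_s32/vqmovn_u16 step */
        const int filtered =
            clip_pixel((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
        /* rounding average with dst: the vrhaddq_u8 step */
        return (uint8_t)((filtered + dst_old + 1) >> 1);
      }
    }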
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
index 4d85846f0..4d85846f0 100644
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c
new file mode 100644
index 000000000..c8704aa9c
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static inline int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vp9_convolve8_horiz_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ uint8_t *s, *d, *psrc, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
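+ // MULTIPLY_BY_Q0 is defined earlier in this file; it is assumed to apply
+ // the eight filter taps held in q0s16 to eight 16-bit source samples and
+ // accumulate the products into 32-bit lanes.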
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
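+ // Round, shift right by FILTER_BITS (7) and saturate the 32-bit filter
+ // sums back to unsigned 16 bits.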
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
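+ // Each 32-bit lane store writes four output pixels of one destination row
+ // of the transposed 4x4 block.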
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
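+ // Slide the source window four pixels to the right, reusing the
+ // transposed columns already held in registers.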
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+void vp9_convolve8_vert_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ uint8_t *s, *d;
+ uint32x2_t d2u32, d3u32;
+ // Zero-initialize these so the partial vld1_lane_u32() loads below never
+ // read an undefined vector.
+ uint32x2_t d16u32 = vdup_n_u32(0), d18u32 = vdup_n_u32(0);
+ uint32x2_t d20u32 = vdup_n_u32(0), d22u32 = vdup_n_u32(0);
+ uint32x2_t d24u32 = vdup_n_u32(0), d26u32 = vdup_n_u32(0);
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
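+ // Process the image in 4-pixel-wide column strips; the first seven source
+ // rows are loaded once per strip and reused as the filter window slides
+ // down.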
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
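+ // Slide the vertical filter window down by four rows for the next
+ // iteration.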
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.asm b/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
index 184c3ad67..184c3ad67 100644
--- a/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_copy_neon.c b/vp9/common/arm/neon/vp9_copy_neon.c
new file mode 100644
index 000000000..f334abe11
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_copy_neon.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_copy_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
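+ // Straight block copy for the unfiltered prediction case; a copy loop is
+ // selected by block width below.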
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_copy_neon.asm b/vp9/common/arm/neon/vp9_copy_neon_asm.asm
index a0bd04a35..a0bd04a35 100644
--- a/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/vp9/common/arm/neon/vp9_copy_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
new file mode 100644
index 000000000..3c8c6a934
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+void vp9_idct16x16_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, j, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
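+ // DC-only inverse transform: a1 is the constant residual added to every
+ // pixel of the 16x16 block, two rows of 16 pixels per inner iteration.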
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ for (d1 = d2 = dest, i = 0; i < 4; i++) {
+ for (j = 0; j < 2; j++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
index b1fd21bb6..b1fd21bb6 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
new file mode 100644
index 000000000..68d7cccc0
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
@@ -0,0 +1,1330 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
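+// The cospi_N_64 constants are round(16384 * cos(N * pi / 64)), i.e. the DCT
+// cosines in Q14 fixed point (DCT_CONST_BITS == 14).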
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static inline void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
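+ // 8x8 transpose of the row registers, built from 64-bit half swaps
+ // (vcombine), 32-bit transposes (vtrnq_s32) and 16-bit transposes
+ // (vtrnq_s16), mirroring the vswp/vtrn sequence of the assembly version.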
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+void vp9_idct16x16_256_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+ int16x8x2_t q0x2s16;
+
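+ // vld2q_s16 deinterleaves each 16-sample row; keeping val[0] selects the
+ // even-numbered elements that pass 1 operates on.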
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d18s16, d1s16);
+ q6s32 = vmull_s16(d19s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+ q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
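+ // vqrshrn_n_s32(x, 14) is the NEON equivalent of dct_const_round_shift():
+ // round, shift by DCT_CONST_BITS and narrow back to 16 bits.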
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q5s32, 14);
+ d15s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ q2s32 = vmull_s16(d26s16, d2s16);
+ q3s32 = vmull_s16(d27s16, d2s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q15s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+ d10s16 = vqrshrn_n_s32(q2s32, 14);
+ d11s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q15s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d30s16);
+ q11s32 = vmull_s16(d17s16, d30s16);
+ q0s32 = vmull_s16(d24s16, d30s16);
+ q1s32 = vmull_s16(d25s16, d30s16);
+
+ d30s16 = vdup_n_s16(cospi_24_64);
+ d31s16 = vdup_n_s16(cospi_8_64);
+
+ q3s32 = vaddq_s32(q2s32, q0s32);
+ q12s32 = vaddq_s32(q11s32, q1s32);
+ q13s32 = vsubq_s32(q2s32, q0s32);
+ q1s32 = vsubq_s32(q11s32, q1s32);
+
+ d16s16 = vqrshrn_n_s32(q3s32, 14);
+ d17s16 = vqrshrn_n_s32(q12s32, 14);
+ d18s16 = vqrshrn_n_s32(q13s32, 14);
+ d19s16 = vqrshrn_n_s32(q1s32, 14);
+ q8s16 = vcombine_s16(d16s16, d17s16);
+ q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q0s32 = vmull_s16(d20s16, d31s16);
+ q1s32 = vmull_s16(d21s16, d31s16);
+ q12s32 = vmull_s16(d20s16, d30s16);
+ q13s32 = vmull_s16(d21s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+ q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+ d22s16 = vqrshrn_n_s32(q0s32, 14);
+ d23s16 = vqrshrn_n_s32(q1s32, 14);
+ d20s16 = vqrshrn_n_s32(q12s32, 14);
+ d21s16 = vqrshrn_n_s32(q13s32, 14);
+ q10s16 = vcombine_s16(d20s16, d21s16);
+ q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q15s16 = vaddq_s16(q6s16, q7s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ // stage 5
+ q0s16 = vaddq_s16(q8s16, q11s16);
+ q1s16 = vaddq_s16(q9s16, q10s16);
+ q2s16 = vsubq_s16(q9s16, q10s16);
+ q3s16 = vsubq_s16(q8s16, q11s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q11s32 = vmull_s16(d26s16, d16s16);
+ q12s32 = vmull_s16(d27s16, d16s16);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q13s32 = vsubq_s32(q10s32, q12s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d11s16 = vqrshrn_n_s32(q13s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q8s16 = vaddq_s16(q0s16, q15s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d16u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vp9_idct16x16_256_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d;
+ uint8x8_t d12u8, d13u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64;
+ int64x1_t d12s64, d13s64;
+ uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+ uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+
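+ // The caller is expected to point src at the odd-indexed coefficients, so
+ // vld2q_s16 + val[0] selects elements 1, 3, 5, ... for the second pass.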
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d12s16 = vdup_n_s16(cospi_30_64);
+ d13s16 = vdup_n_s16(cospi_2_64);
+
+ q2s32 = vmull_s16(d16s16, d12s16);
+ q3s32 = vmull_s16(d17s16, d12s16);
+ q1s32 = vmull_s16(d16s16, d13s16);
+ q4s32 = vmull_s16(d17s16, d13s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+ q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+ q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+ d0s16 = vqrshrn_n_s32(q2s32, 14);
+ d1s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q1s32, 14);
+ d15s16 = vqrshrn_n_s32(q4s32, 14);
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_14_64);
+ d31s16 = vdup_n_s16(cospi_18_64);
+
+ q2s32 = vmull_s16(d24s16, d30s16);
+ q3s32 = vmull_s16(d25s16, d30s16);
+ q4s32 = vmull_s16(d24s16, d31s16);
+ q5s32 = vmull_s16(d25s16, d31s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q2s32, 14);
+ d3s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q4s32, 14);
+ d13s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(cospi_22_64);
+ d31s16 = vdup_n_s16(cospi_10_64);
+
+ q11s32 = vmull_s16(d20s16, d30s16);
+ q12s32 = vmull_s16(d21s16, d30s16);
+ q4s32 = vmull_s16(d20s16, d31s16);
+ q5s32 = vmull_s16(d21s16, d31s16);
+
+ q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d11s16 = vqrshrn_n_s32(q5s32, 14);
+ d10s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ d30s16 = vdup_n_s16(cospi_6_64);
+ d31s16 = vdup_n_s16(cospi_26_64);
+
+ q10s32 = vmull_s16(d28s16, d30s16);
+ q11s32 = vmull_s16(d29s16, d30s16);
+ q12s32 = vmull_s16(d28s16, d31s16);
+ q13s32 = vmull_s16(d29s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+ q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+ q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+ q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q11s32, 14);
+ d8s16 = vqrshrn_n_s32(q12s32, 14);
+ d9s16 = vqrshrn_n_s32(q13s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 3
+ q9s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q10s16 = vsubq_s16(q3s16, q2s16);
+ q11s16 = vaddq_s16(q2s16, q3s16);
+ q12s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q6s16, q7s16);
+
+ // stage 4
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q2s32 = vmull_s16(d18s16, d31s16);
+ q3s32 = vmull_s16(d19s16, d31s16);
+ q4s32 = vmull_s16(d28s16, d31s16);
+ q5s32 = vmull_s16(d29s16, d31s16);
+
+ q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+ q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q3s32, 14);
+ d2s16 = vqrshrn_n_s32(q4s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q3s16 = q11s16;
+ q4s16 = q12s16;
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q11s32 = vmull_s16(d26s16, d30s16);
+ q12s32 = vmull_s16(d27s16, d30s16);
+ q8s32 = vmull_s16(d20s16, d30s16);
+ q9s32 = vmull_s16(d21s16, d30s16);
+
+ q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q10s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q10s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ if (skip_adding != 0) {
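+ // Combine each group of eight outputs with the saved pass-1 rows, round by
+ // 6 bits and add the reconstructed residual directly to the destination
+ // pixels.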
+ d = dest;
+ // load the data in pass1
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ // store the data out 8,9,10,11,12,13,14,15
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q2s16 = vrshrq_n_s16(q2s16, 6);
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q3s16 = vrshrq_n_s16(q3s16, 6);
+ q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+ q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q14s16 = vrshrq_n_s16(q14s16, 6);
+ q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ q15s16 = vrshrq_n_s16(q15s16, 6);
+ q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ } else { // skip_adding_dest
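+ // Otherwise store the 16-bit results to the intermediate buffer: each row
+ // contributes eight coefficients written as two 64-bit stores, then
+ // out += 12 skips to the start of the next 16-wide row.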
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ }
+ return;
+}
+
+void vp9_idct16x16_10_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d4s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
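+ // In the _10_ variant only the low-frequency coefficients are non-zero, so
+ // most butterfly terms vanish and the stages reduce to a few multiplies.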
+ // stage 3
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
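+ // vqrdmulhq_s16 rounds and shifts by 15, so the constants are doubled to
+ // match the 14-bit dct_const_round_shift scaling.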
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ // stage 4
+ q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+ d4s16 = vdup_n_s16(cospi_16_64);
+
+ q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+ q9s32 = vmull_s16(d14s16, d4s16);
+ q10s32 = vmull_s16(d15s16, d4s16);
+ q12s32 = vmull_s16(d9s16, d4s16);
+ q11s32 = vmull_s16(d8s16, d4s16);
+
+ q15s32 = vsubq_s32(q10s32, q12s32);
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d11s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q2s16 = vaddq_s16(q8s16, q7s16);
+ q9s16 = vaddq_s16(q8s16, q6s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q8s16, q4s16);
+ q12s16 = vsubq_s16(q8s16, q4s16);
+ q13s16 = vsubq_s16(q8s16, q5s16);
+ q14s16 = vsubq_s16(q8s16, q6s16);
+ q15s16 = vsubq_s16(q8s16, q7s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d4u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vp9_idct16x16_10_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+ uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+ (void)skip_adding;
+ (void)dest;
+ (void)dest_stride;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
+ q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+ q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+ q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+ q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+ q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+ q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+ q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+ // stage 4
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d6s16 = vget_low_s16(q3s16);
+ d7s16 = vget_high_s16(q3s16);
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q12s32 = vmull_s16(d14s16, d31s16);
+ q5s32 = vmull_s16(d15s16, d31s16);
+ q2s32 = vmull_s16(d0s16, d31s16);
+ q11s32 = vmull_s16(d1s16, d31s16);
+
+ q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+ q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q12s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q11s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q10s32 = vmull_s16(d8s16, d30s16);
+ q13s32 = vmull_s16(d9s16, d30s16);
+ q8s32 = vmull_s16(d6s16, d30s16);
+ q9s32 = vmull_s16(d7s16, d30s16);
+
+ q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q10s32, 14);
+ d5s16 = vqrshrn_n_s32(q13s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q0s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q0s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+ d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+ d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+ d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+ d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+ d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ vst1_u64((uint64_t *)out, d16u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d4u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d6u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d7u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d8u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d9u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d10u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d11u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
index a13c0d04b..a13c0d04b 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 0b9fc09ab..f2c4ec451 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -30,18 +30,24 @@ void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
uint8_t *dest,
int dest_stride);
+#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
+#endif // HAVE_NEON_ASM
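+// The compiler preserves the callee-saved d8-d15 registers in the intrinsics
+// build, so the explicit push/pop helpers are only required for the assembly
+// version.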
void vp9_idct16x16_256_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
int64_t store_reg[8];
+#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
+#if HAVE_NEON_ASM
// save d8-d15 register values.
vp9_push_neon(store_reg);
+#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -103,20 +109,26 @@ void vp9_idct16x16_256_add_neon(const int16_t *input,
dest+8,
dest_stride);
+#if HAVE_NEON_ASM
// restore d8-d15 register values.
vp9_pop_neon(store_reg);
+#endif
return;
}
void vp9_idct16x16_10_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
int64_t store_reg[8];
+#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
+#if HAVE_NEON_ASM
// save d8-d15 register values.
vp9_push_neon(store_reg);
+#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -165,8 +177,10 @@ void vp9_idct16x16_10_add_neon(const int16_t *input,
dest+8,
dest_stride);
+#if HAVE_NEON_ASM
// restore d8-d15 register values.
vp9_pop_neon(store_reg);
+#endif
return;
}
diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
new file mode 100644
index 000000000..1bfee22b2
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+static inline void LD_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vld1q_u8(d);
+ d += d_stride;
+ *q9u8 = vld1q_u8(d);
+ d += d_stride;
+ *q10u8 = vld1q_u8(d);
+ d += d_stride;
+ *q11u8 = vld1q_u8(d);
+ d += d_stride;
+ *q12u8 = vld1q_u8(d);
+ d += d_stride;
+ *q13u8 = vld1q_u8(d);
+ d += d_stride;
+ *q14u8 = vld1q_u8(d);
+ d += d_stride;
+ *q15u8 = vld1q_u8(d);
+ return;
+}
+
+static inline void ADD_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static inline void SUB_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static inline void ST_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ vst1q_u8(d, *q8u8);
+ d += d_stride;
+ vst1q_u8(d, *q9u8);
+ d += d_stride;
+ vst1q_u8(d, *q10u8);
+ d += d_stride;
+ vst1q_u8(d, *q11u8);
+ d += d_stride;
+ vst1q_u8(d, *q12u8);
+ d += d_stride;
+ vst1q_u8(d, *q13u8);
+ d += d_stride;
+ vst1q_u8(d, *q14u8);
+ d += d_stride;
+ vst1q_u8(d, *q15u8);
+ return;
+}
+
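+// Editor note (summary of the function below): DC-only 32x32 inverse
+// transform. The single nonzero coefficient is scaled twice by cospi_16_64
+// and rounded to a1, which is then added to (or, when a1 is negative,
+// subtracted from) every pixel of the 32x32 destination block in 16x8 tiles.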
+void vp9_idct32x32_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int i, j, dest_stride8;
+ uint8_t *d;
+ int16_t a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ dest_stride8 = dest_stride * 8;
+ if (a1 >= 0) { // diff_positive_32_32
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ } else { // diff_negative_32_32
+ a1 = -a1;
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
index d290d0753..d290d0753 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
new file mode 100644
index 000000000..53f721b44
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
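+// cospi_k_64 = round(cos(k * pi / 64) * 2^14), i.e. the DCT rotation
+// constants in Q14 fixed point.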
+static int16_t cospi_1_64 = 16364;
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_3_64 = 16207;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_5_64 = 15893;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_7_64 = 15426;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_9_64 = 14811;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_11_64 = 14053;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_13_64 = 13160;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_15_64 = 12140;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_17_64 = 11003;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_19_64 = 9760;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_21_64 = 8423;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_23_64 = 7005;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_25_64 = 5520;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_27_64 = 3981;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_29_64 = 2404;
+static int16_t cospi_30_64 = 1606;
+static int16_t cospi_31_64 = 804;
+
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+ q14s16 = vld1q_s16(trans_buf + first * 8); \
+ q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+ qA = vld1q_s16(out + first * 32); \
+ qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+ vst1q_s16(out + first * 32, qA); \
+ vst1q_s16(out + second * 32, qB);
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+ q6s16, q7s16, q8s16, q9s16);
+static inline void __STORE_COMBINE_CENTER_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16) {
+ int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+ d8s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d11s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d9s16 = vld1_s16((int16_t *)p1);
+ d10s16 = vld1_s16((int16_t *)p2);
+
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d9s16)));
+ q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s16(d10s16)));
+ q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s16(d11s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d8s16)));
+
+ d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+ d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+ d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+ vst1_s16((int16_t *)p1, d9s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d10s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p1, d8s16);
+ vst1_s16((int16_t *)p2, d11s16);
+ return;
+}
+
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+ q4s16, q5s16, q6s16, q7s16);
+static inline void __STORE_COMBINE_EXTREME_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q4s16,
+ int16x8_t q5s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16) {
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+ d4s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d7s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d5s16 = vld1_s16((int16_t *)p1);
+ d6s16 = vld1_s16((int16_t *)p2);
+
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+
+ q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s16(d5s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d6s16)));
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d7s16)));
+ q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s16(d4s16)));
+
+ d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+ d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+ d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+ vst1_s16((int16_t *)p1, d5s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d6s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p2, d7s16);
+ vst1_s16((int16_t *)p1, d4s16);
+ return;
+}
+
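+// DO_BUTTERFLY applies the rotation used throughout the idct, element-wise
+// with 32-bit intermediates:
+//   *qAs16 = ROUND_POWER_OF_TWO(q14s16 * first_const - q13s16 * second_const, 14)
+//   *qBs16 = ROUND_POWER_OF_TWO(q13s16 * first_const + q14s16 * second_const, 14)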
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static inline void DO_BUTTERFLY(
+ int16x8_t q14s16,
+ int16x8_t q13s16,
+ int16_t first_const,
+ int16_t second_const,
+ int16x8_t *qAs16,
+ int16x8_t *qBs16) {
+ int16x4_t d30s16, d31s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+ int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+ dCs16 = vget_low_s16(q14s16);
+ dDs16 = vget_high_s16(q14s16);
+ dAs16 = vget_low_s16(q13s16);
+ dBs16 = vget_high_s16(q13s16);
+
+ d30s16 = vdup_n_s16(first_const);
+ d31s16 = vdup_n_s16(second_const);
+
+ q8s32 = vmull_s16(dCs16, d30s16);
+ q10s32 = vmull_s16(dAs16, d31s16);
+ q9s32 = vmull_s16(dDs16, d30s16);
+ q11s32 = vmull_s16(dBs16, d31s16);
+ q12s32 = vmull_s16(dCs16, d31s16);
+
+ q8s32 = vsubq_s32(q8s32, q10s32);
+ q9s32 = vsubq_s32(q9s32, q11s32);
+
+ q10s32 = vmull_s16(dDs16, d31s16);
+ q11s32 = vmull_s16(dAs16, d30s16);
+ q15s32 = vmull_s16(dBs16, d30s16);
+
+ q11s32 = vaddq_s32(q12s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q15s32);
+
+ *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
+ vqrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
+ vqrshrn_n_s32(q10s32, 14));
+ return;
+}
+
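+// Transposes an 8x32 band of the input (eight rows, stride 32) into t_buf,
+// which then holds the band as 32 consecutive groups of eight coefficients.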
+static inline void idct32_transpose_pair(
+ int16_t *input,
+ int16_t *t_buf) {
+ int16_t *in;
+ int i;
+ const int stride = 32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ in = input;
+ q8s16 = vld1q_s16(in);
+ in += stride;
+ q9s16 = vld1q_s16(in);
+ in += stride;
+ q10s16 = vld1q_s16(in);
+ in += stride;
+ q11s16 = vld1q_s16(in);
+ in += stride;
+ q12s16 = vld1q_s16(in);
+ in += stride;
+ q13s16 = vld1q_s16(in);
+ in += stride;
+ q14s16 = vld1q_s16(in);
+ in += stride;
+ q15s16 = vld1q_s16(in);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ q12s16 = vcombine_s16(d17s16, d25s16);
+ q13s16 = vcombine_s16(d19s16, d27s16);
+ q14s16 = vcombine_s16(d21s16, d29s16);
+ q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
+ vreinterpretq_s32_s16(q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
+ vreinterpretq_s32_s16(q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
+ vreinterpretq_s32_s16(q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ vst1q_s16(t_buf, q0x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q0x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[1]);
+ t_buf += 8;
+ }
+ return;
+}
+
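+// Final add/sub stage of the first (row) pass: combines the even-half results
+// passed in with the odd-half values already stored in 'out' and writes the
+// completed rows back to 'out'.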
+static inline void idct32_bands_end_1st_pass(
+ int16_t *out,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+ STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+ STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+ STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+ STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+ STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+ STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+ STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+ STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+ return;
+}
+
+static inline void idct32_bands_end_2nd_pass(
+ int16_t *out,
+ uint8_t *dest,
+ int stride,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ uint8_t *r6 = dest + 31 * stride;
+ uint8_t *r7 = dest/* + 0 * stride*/;
+ uint8_t *r9 = dest + 15 * stride;
+ uint8_t *r10 = dest + 16 * stride;
+ int str2 = stride << 1;
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+ LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ return;
+}
+
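+// Full 32x32 inverse transform, done in two passes of four 8-row bands each:
+// pass 1 works on the rows of 'input' and stores the intermediate result in
+// pass1[]; pass 2 reruns the same band loop on pass1[] and accumulates the
+// final pixels into dest through idct32_bands_end_2nd_pass().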
+void vp9_idct32x32_1024_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ int16_t *out;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+ for (idct32_pass_loop = 0, out = pass1;
+ idct32_pass_loop < 2;
+ idct32_pass_loop++,
+ input = pass1, // the input of pass2 is the result of pass1
+ out = pass2) {
+ for (i = 0;
+ i < 4; i++,
+ input += 32 * 8, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(0, 1, 31)
+ DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(31, 17, 15)
+ DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+ // part of stage 2
+ q4s16 = vaddq_s16(q0s16, q1s16);
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q6s16 = vaddq_s16(q2s16, q3s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+ // generate 18,19,28,29
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(15, 9, 23)
+ DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(23, 25, 7)
+ DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q3s16, q2s16);
+ q3s16 = vaddq_s16(q3s16, q2s16);
+ q14s16 = vsubq_s16(q1s16, q0s16);
+ q2s16 = vaddq_s16(q1s16, q0s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+ // part of stage 4
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q15s16 = vaddq_s16(q6s16, q3s16);
+ q13s16 = vsubq_s16(q5s16, q0s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+ STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+ STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q4s16, q2s16);
+ q14s16 = vsubq_s16(q6s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+ STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(7, 5, 27)
+ DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(27, 21, 11)
+ DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+ // generate 22,23,24,25
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(11, 13, 19)
+ DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(19, 29, 3)
+ DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+ // part of stage 2
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+ // part of stage 4
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q11s16 = vaddq_s16(q5s16, q0s16);
+ q12s16 = vaddq_s16(q6s16, q2s16);
+ q15s16 = vaddq_s16(q4s16, q3s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q11s16);
+ q9s16 = vaddq_s16(q13s16, q10s16);
+ q13s16 = vsubq_s16(q13s16, q10s16);
+ q11s16 = vsubq_s16(q14s16, q11s16);
+ STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+ LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+ q8s16 = vsubq_s16(q9s16, q12s16);
+ q10s16 = vaddq_s16(q14s16, q15s16);
+ q14s16 = vsubq_s16(q14s16, q15s16);
+ q12s16 = vaddq_s16(q9s16, q12s16);
+ STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+ q13s16 = q11s16;
+ q14s16 = q8s16;
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+ // part of stage 4
+ q14s16 = vsubq_s16(q5s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q2s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ q13s16 = vsubq_s16(q4s16, q3s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q1s16);
+ q9s16 = vaddq_s16(q13s16, q6s16);
+ q13s16 = vsubq_s16(q13s16, q6s16);
+ q1s16 = vsubq_s16(q14s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+ LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+ q14s16 = vsubq_s16(q8s16, q5s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q9s16, q0s16);
+ q0s16 = vsubq_s16(q9s16, q0s16);
+ STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+ DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+ &q1s16, &q0s16);
+ STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+ // -----------------------------------------
+ // BLOCK C: 8-11,12-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(3, 2, 30)
+ DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(30, 18, 14)
+ DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+ // part of stage 3
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+ // generate 10,11,12,13
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(14, 10, 22)
+ DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(22, 26, 6)
+ DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+ // part of stage 3
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+ // part of stage 5
+ q8s16 = vaddq_s16(q0s16, q5s16);
+ q9s16 = vaddq_s16(q1s16, q7s16);
+ q13s16 = vsubq_s16(q1s16, q7s16);
+ q14s16 = vsubq_s16(q3s16, q4s16);
+ q10s16 = vaddq_s16(q3s16, q4s16);
+ q15s16 = vaddq_s16(q2s16, q6s16);
+ STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+ STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+ // part of stage 6
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+ q13s16 = vsubq_s16(q0s16, q5s16);
+ q14s16 = vsubq_s16(q2s16, q6s16);
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ LOAD_FROM_TRANSPOSED(6, 4, 28)
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(28, 20, 12)
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+ // generate 0,1,2,3
+ // part of stage 4
+ LOAD_FROM_TRANSPOSED(12, 0, 16)
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(16, 8, 24)
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+ // part of stage 5
+ q4s16 = vaddq_s16(q7s16, q6s16);
+ q7s16 = vsubq_s16(q7s16, q6s16);
+ q6s16 = vsubq_s16(q5s16, q14s16);
+ q5s16 = vaddq_s16(q5s16, q14s16);
+ // part of stage 6
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q3s16);
+ q10s16 = vaddq_s16(q6s16, q1s16);
+ q11s16 = vaddq_s16(q7s16, q0s16);
+ q12s16 = vsubq_s16(q7s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q1s16);
+ q14s16 = vsubq_s16(q5s16, q3s16);
+ q15s16 = vsubq_s16(q4s16, q2s16);
+ // part of stage 7
+ LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+ q2s16 = vaddq_s16(q8s16, q1s16);
+ q3s16 = vaddq_s16(q9s16, q0s16);
+ q4s16 = vsubq_s16(q9s16, q0s16);
+ q5s16 = vsubq_s16(q8s16, q1s16);
+ LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ dest += 8;
+ }
+ }
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
index 72e933eee..72e933eee 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
new file mode 100644
index 000000000..7c8a930b6
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
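+// Editor note (summary of the function below): DC-only 4x4 inverse transform.
+// a1 is added to all 16 destination pixels, four at a time, with saturating
+// narrowing back to 8 bits.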
+void vp9_idct4x4_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d6u8;
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint16x8_t q8u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ q0s16 = vdupq_n_s16(a1);
+
+ // dc_only_idct_add
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+ d1 += dest_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+ vreinterpret_u8_u32(d2u32));
+ d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+ d2 += dest_stride;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
index 0d4a721c4..0d4a721c4 100644
--- a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct4x4_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
new file mode 100644
index 000000000..dc91e0f30
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
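+// Full 4x4 inverse transform: a 4-point idct on the rows, the same idct on
+// the columns, a rounding shift by 4, then the result is added to the four
+// destination rows with saturation.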
+void vp9_idct4x4_16_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d26u8, d27u8;
+ uint32x2_t d26u32, d27u32;
+ uint16x8_t q8u16, q9u16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+ int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+ int16x8_t q8s16, q9s16, q13s16, q14s16;
+ int32x4_t q1s32, q13s32, q14s32, q15s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+ uint8_t *d;
+ int16_t cospi_8_64 = 15137;
+ int16_t cospi_16_64 = 11585;
+ int16_t cospi_24_64 = 6270;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ d20s16 = vdup_n_s16(cospi_8_64);
+ d21s16 = vdup_n_s16(cospi_16_64);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ d22s16 = vdup_n_s16(cospi_24_64);
+
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_high_s16(q9s16); // vswp d18 d19
+ d19s16 = vget_low_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ // do the transform on columns
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d = dest;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+ d += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ d = dest;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm b/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
index 00283fc8d..00283fc8d 100644
--- a/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
new file mode 100644
index 000000000..24c29fb77
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
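+// Editor note (summary of the function below): DC-only 8x8 inverse transform.
+// a1 is added to all 64 destination pixels, one 8-pixel row at a time.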
+void vp9_idct8x8_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
index 421d202d4..421d202d4 100644
--- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
new file mode 100644
index 000000000..50587f6bc
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_28_64 = 3196;
+
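+// In-register 8x8 transpose of q8-q15, mirroring the vswp/vtrn sequence of
+// the assembly version.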
+static inline void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
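+// One-dimensional 8-point idct applied to all eight columns held in q8-q15;
+// the results are written back to the same registers.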
+static inline void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
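+// Full 8x8 inverse transform: transpose + 1-D idct for the rows, transpose +
+// 1-D idct for the columns, a rounding shift by 5, then the two halves of the
+// block are added into dest four rows at a time.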
+void vp9_idct8x8_64_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
+
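+// 8x8 inverse transform for sparse coefficient blocks (presumably the small-
+// eob case): only the first four input rows (q8-q11) feed the row pass, the
+// remaining rows being treated as zero, before the full column pass.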
+void vp9_idct8x8_12_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // First transform rows
+ // stage 1
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+ q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+ q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+ q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+ // stage 2 & stage 3 - even half
+ q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+ q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+ q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+ q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+ // stage 3 - odd half
+ q0s16 = vaddq_s16(q9s16, q15s16);
+ q1s16 = vaddq_s16(q9s16, q13s16);
+ q2s16 = vsubq_s16(q9s16, q13s16);
+ q3s16 = vsubq_s16(q9s16, q15s16);
+
+ // stage 2 - odd half
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ q8s16 = vaddq_s16(q0s16, q7s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q7s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
index ab5bb6920..ab5bb6920 100644
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index bc6a17cd1..97fe02805 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -18,8 +18,8 @@ void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1);
+ vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
}
void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
@@ -44,6 +44,7 @@ void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}
+#if HAVE_NEON_ASM
void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
@@ -51,3 +52,4 @@ void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
+#endif // HAVE_NEON_ASM
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
index 5b8ec2028..5b8ec2028 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c
new file mode 100644
index 000000000..f54d7a94b
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -0,0 +1,712 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static inline void vp9_loop_filter_neon(
+ uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+ vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+ vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
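
/* Reference sketch (illustrative, not part of this change): the intrinsic
 * sequence above is the vector form of VP9's 4-tap loop filter.  A scalar
 * version of the same per-pixel math, assuming ps1/ps0/qs0/qs1 have already
 * been biased into signed range with ^0x80 and that mask/hev are 0 or -1
 * per pixel, looks like this: */
static int8_t clamp8_sketch(int v) {
  return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void filter4_sketch(int8_t mask, int8_t hev, int8_t *ps1, int8_t *ps0,
                           int8_t *qs0, int8_t *qs1) {
  int8_t filter = clamp8_sketch(*ps1 - *qs1) & hev;
  filter = clamp8_sketch(filter + 3 * (*qs0 - *ps0)) & mask;

  const int8_t filter1 = clamp8_sketch(filter + 4) >> 3;
  const int8_t filter2 = clamp8_sketch(filter + 3) >> 3;

  *qs0 = clamp8_sketch(*qs0 - filter1);
  *ps0 = clamp8_sketch(*ps0 + filter2);

  /* Outer taps only move when the pixel is not a high-edge-variance one. */
  filter = (int8_t)((filter1 + 1) >> 1) & (int8_t)~hev;
  *qs1 = clamp8_sketch(*qs1 - filter);
  *ps1 = clamp8_sketch(*ps1 + filter);
}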
+
+void vp9_lpf_horizontal_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_lf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ if (count == 0) // end_vp9_lf_v_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < count; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
+}
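
/* Reference sketch (illustrative, not part of this change): the
 * vtrn_u32/vtrn_u16/vtrn_u8 ladder above is the standard three-stage NEON
 * 8x8 byte transpose, turning the eight loaded rows into the eight pixel
 * columns that cross the vertical edge.  Its effect in plain scalar code: */
static void transpose_8x8_sketch(const unsigned char in[8][8],
                                 unsigned char out[8][8]) {
  int r, c;
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++)
      out[c][r] = in[r][c];
}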
+
+static inline void vp9_mbloop_filter_neon(
+ uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p2
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q2
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+ vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
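
/* Reference sketch (illustrative, not part of this change): when the flat
 * mask is all ones, the running q14u16 accumulator above evaluates VP9's
 * 8-tap smoothing (filter8).  The same taps written out in scalar form,
 * with RND3_SKETCH() standing in for the rounding vqrshrn_n_u16(..., 3): */
#define RND3_SKETCH(x) (((x) + 4) >> 3)

static void filter8_sketch(int p3, int p2, int p1, int p0,
                           int q0, int q1, int q2, int q3,
                           int *op2, int *op1, int *op0,
                           int *oq0, int *oq1, int *oq2) {
  *op2 = RND3_SKETCH(3 * p3 + 2 * p2 + p1 + p0 + q0);
  *op1 = RND3_SKETCH(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
  *op0 = RND3_SKETCH(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
  *oq0 = RND3_SKETCH(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
  *oq1 = RND3_SKETCH(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
  *oq2 = RND3_SKETCH(p0 + q0 + q1 + 2 * q2 + 3 * q3);
}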
+
+void vp9_lpf_horizontal_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_mblf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ if (count == 0)
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < count; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm
index 443032217..443032217 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 284d3a2b5..cb299f9f7 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -45,6 +45,7 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
}
vp9_free_frame_buffer(&cm->post_proc_buffer);
+ vp9_free_frame_buffer(&cm->post_proc_buffer_int);
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 893a2bb63..7d7209c56 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -192,6 +192,10 @@ typedef struct macroblockd {
int mi_stride;
MODE_INFO *mi;
+ MODE_INFO *left_mi;
+ MODE_INFO *above_mi;
+ MB_MODE_INFO *left_mbmi;
+ MB_MODE_INFO *above_mbmi;
int up_available;
int left_available;
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
new file mode 100644
index 000000000..f1bdc1b06
--- /dev/null
+++ b/vp9/common/vp9_mfqe.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// an SSE2 implementation in vp8; consider how to share it
+// between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int block_size, int src_weight) {
+ const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+ int r, c;
+
+ for (r = 0; r < block_size; r++) {
+ for (c = 0; c < block_size; c++) {
+ dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+ >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
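+
/* Reference note (illustrative, not part of this change): with
 * MFQE_PRECISION == 4 the blend above is
 *   dst[c] = (src[c] * w + dst[c] * (16 - w) + 8) >> 4,
 * a fixed-point mix where w/16 is the share taken from src.  For example,
 * with src_weight w = 4, src = 100 and dst = 180:
 *   (100 * 4 + 180 * 12 + 8) >> 4 = (400 + 2160 + 8) >> 4 = 160. */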
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
+ filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
+ filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
+ dst_stride, 16, weight);
+ filter_by_weight(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+ filter_by_weight32x32(src + 32, src_stride, dst + 32,
+ dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32, src_stride,
+ dst + dst_stride * 32, dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+ dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+ int yd_stride, const uint8_t *u, const uint8_t *v,
+ int uv_stride, uint8_t *ud, uint8_t *vd,
+ int uvd_stride, BLOCK_SIZE block_size,
+ int weight) {
+ if (block_size == BLOCK_16X16) {
+ filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
+ filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
+ filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+ } else if (block_size == BLOCK_32X32) {
+ filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+ filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
+ filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+ } else if (block_size == BLOCK_64X64) {
+ filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+ filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+ filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+ }
+}
+
+// TODO(jackychen): Determine whether to replace this with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 8; r++) {
+ memcpy(dst, src, 8);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 16; r++) {
+ memcpy(dst, src, 16);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ copy_mem16x16(src, src_stride, dst, dst_stride);
+ copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride);
+}
+
+void copy_mem64x64(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ copy_mem32x32(src, src_stride, dst, dst_stride);
+ copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+ copy_mem32x32(src + src_stride * 32, src_stride,
+ dst + dst_stride * 32, dst_stride);
+ copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+ dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+ uint8_t *vd, int yd_stride, int uvd_stride,
+ BLOCK_SIZE bs) {
+ if (bs == BLOCK_16X16) {
+ copy_mem16x16(y, y_stride, yd, yd_stride);
+ copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ } else if (bs == BLOCK_32X32) {
+ copy_mem32x32(y, y_stride, yd, yd_stride);
+ copy_mem16x16(u, uv_stride, ud, uvd_stride);
+ copy_mem16x16(v, uv_stride, vd, uvd_stride);
+ } else {
+ copy_mem64x64(y, y_stride, yd, yd_stride);
+ copy_mem32x32(u, uv_stride, ud, uvd_stride);
+ copy_mem32x32(v, uv_stride, vd, uvd_stride);
+ }
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd,
+ int yd_stride, int uvd_stride) {
+ int sad, sad_thr, vdiff;
+ uint32_t sse;
+
+ if (bs == BLOCK_16X16) {
+ vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+ sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ } else if (bs == BLOCK_32X32) {
+ vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+ sad = (vp9_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+ } else /* if (bs == BLOCK_64X64) */ {
+ vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+ sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+ }
+
+ if (bs == BLOCK_16X16) {
+ sad_thr = 8;
+ } else if (bs == BLOCK_32X32) {
+ sad_thr = 7;
+ } else { // BLOCK_64X64
+ sad_thr = 6;
+ }
+
+ // TODO(jackychen): More experiments and remove magic numbers.
+ // vdiff > sad * 3 means vdiff should not be too small; otherwise it
+ // might just be a lighting change in a smooth area, and doing MFQE on
+ // such a lighting change is dangerous.
+ if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
+ // TODO(jackychen): Add weighted average in the calculation.
+ // Currently, the data is copied from last frame without averaging.
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
+ ud, vd, uvd_stride, bs, 0);
+ } else {
+ // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+}
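
/* Reference note (illustrative, not part of this change): the (x + 128) >> 8,
 * (x + 512) >> 10 and (x + 2048) >> 12 terms above turn the block SAD and
 * variance into rounded per-pixel averages (a 16x16 block has 256 pixels,
 * 32x32 has 1024, 64x64 has 4096).  For example, a 16x16 block with a total
 * SAD of 1500 gives (1500 + 128) >> 8 = 6, which is below sad_thr = 8, so the
 * block still qualifies provided vdiff > 18 (= 6 * 3) and vdiff < 150. */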
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+ // Check the motion in the current block (for an inter frame),
+ // or in the correlated block in the last frame (for a keyframe).
+ const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+ mi->mbmi.mv[0].as_mv.row +
+ mi->mbmi.mv[0].as_mv.col *
+ mi->mbmi.mv[0].as_mv.col;
+ const int mv_threshold = 100;
+ return mi->mbmi.mode >= NEARESTMV && // Not an intra block
+ cur_bs >= BLOCK_16X16 &&
+ mv_len_square <= mv_threshold;
+}
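
/* Reference note (illustrative, not part of this change): the mv components
 * are in eighth-pel units, so mv_threshold = 100 on the squared length
 * accepts motion of at most sqrt(100) = 10 eighth-pels, i.e. 1.25 pixels.
 * E.g. mv = (6, 8) gives 36 + 64 = 100 and still counts as stationary,
 * while mv = (8, 8) gives 128 > 100 and is rejected. */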
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+ const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd,
+ int yd_stride, int uvd_stride) {
+ int mi_offset, y_offset, uv_offset;
+ const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+ // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
+ // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+ const int bsl = b_width_log2_lookup[bs];
+ PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+ const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+ if (cur_bs < BLOCK_8X8) {
+ // If the block is smaller than 8x8, it must be on the boundary.
+ return;
+ }
+ // No MFQE on blocks smaller than 16x16
+ if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) {
+ partition = PARTITION_NONE;
+ }
+ switch (partition) {
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ // The current block size is not square, so copy the block from the
+ // current frame (i.e., no MFQE is done).
+ // TODO(jackychen): Rectangular blocks should also be taken into account.
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ break;
+ case PARTITION_NONE:
+ if (mfqe_decision(mi, cur_bs)) {
+ // Do mfqe on this partition.
+ mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride);
+ } else {
+ // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bs == BLOCK_64X64) {
+ mi_offset = 4;
+ y_offset = 32;
+ uv_offset = 16;
+ } else {
+ mi_offset = 2;
+ y_offset = 16;
+ uv_offset = 8;
+ }
+ // Recursion on four square partitions, e.g. if bs is 64X64,
+ // then look into four 32X32 blocks in it.
+ mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+ v + uv_offset, y_stride, uv_stride, yd + y_offset,
+ ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+ y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+ subsize, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+ int mi_row, mi_col;
+ // Current decoded frame.
+ const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+ // Last decoded frame; it will store the MFQE result.
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+ // Loop through each super block.
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ MODE_INFO *mi;
+ MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+ // Motion info in the last frame.
+ MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+ (mi_row * cm->mi_stride + mi_col);
+ const uint32_t y_stride = show->y_stride;
+ const uint32_t uv_stride = show->uv_stride;
+ const uint32_t yd_stride = dest->y_stride;
+ const uint32_t uvd_stride = dest->uv_stride;
+ const uint32_t row_offset_y = mi_row << 3;
+ const uint32_t row_offset_uv = mi_row << 2;
+ const uint32_t col_offset_y = mi_col << 3;
+ const uint32_t col_offset_uv = mi_col << 2;
+ const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+ col_offset_y;
+ const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+ uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ if (frame_is_intra_only(cm)) {
+ mi = mi_prev;
+ } else {
+ mi = mi_local;
+ }
+ mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+ vd, yd_stride, uvd_stride);
+ }
+ }
+}
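
Each mi unit covers 8 luma pixels, so the offsets above convert mi coordinates to pixels with << 3 for luma and << 2 for chroma (the << 2 chroma shift corresponds to 4:2:0 subsampling). For example, the superblock at mi_row = 16, mi_col = 8 starts at luma pixel (128, 64) and chroma pixel (64, 32).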
diff --git a/vp9/common/vp9_mfqe.h b/vp9/common/vp9_mfqe.h
new file mode 100644
index 000000000..dfff8c23d
--- /dev/null
+++ b/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim of MFQE is to replace pixel blocks in the current frame with
+// the correlated (higher-quality) pixel blocks in the last frame.
+// The replacement is only applied to stationary blocks, determined by
+// checking the motion of the blocks and other conditions such as the SAD
+// between the current block and the correlated block, the variance of the
+// block difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_COMMON_VP9_MFQE_H_
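
From the application side, MFQE is requested through the decoder's postproc configuration. A minimal usage sketch, assuming the existing VP8_SET_POSTPROC control path and that the flag corresponding to VP9D_MFQE is exposed to applications as VP8_MFQE (as it is for VP8); illustrative only, not part of this change:

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

/* Illustrative: turn on MFQE (plus deblocking) for a decoder instance.
 * Assumes the VP8_MFQE postproc flag maps onto the VP9D_MFQE flag added here. */
static void enable_mfqe_sketch(vpx_codec_ctx_t *decoder) {
  vp8_postproc_cfg_t pp;
  pp.post_proc_flag = VP8_DEBLOCK | VP8_MFQE;
  pp.deblocking_level = 4;
  pp.noise_level = 0;
  vpx_codec_control(decoder, VP8_SET_POSTPROC, &pp);
}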
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index f2c2d255f..55a1f86c7 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -113,6 +113,7 @@ typedef struct VP9Common {
int new_fb_idx;
YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
FRAME_TYPE frame_type;
@@ -309,6 +310,21 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
// Are edges available for intra prediction?
xd->up_available = (mi_row != 0);
xd->left_available = (mi_col > tile->mi_col_start);
+ if (xd->up_available) {
+ xd->above_mi = xd->mi[-xd->mi_stride].src_mi;
+ xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+ } else {
+ xd->above_mi = NULL;
+ xd->above_mbmi = NULL;
+ }
+
+ if (xd->left_available) {
+ xd->left_mi = xd->mi[-1].src_mi;
+ xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+ } else {
+ xd->left_mi = NULL;
+ xd->left_mbmi = NULL;
+ }
}
static INLINE void update_partition_context(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 575ffbc30..e1a389132 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -79,6 +79,9 @@ const short vp9_rv[] = {
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
+
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
@@ -616,6 +619,17 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise,
}
}
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO *temp = cm->postproc_state.prev_mip;
+ cm->postproc_state.prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+}
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
const int q = MIN(63, cm->lf.filter_level * 10 / 6);
@@ -633,6 +647,42 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vp9_clear_system_state();
+ // Allocate memory for prev_mip on the first frame.
+ if (cm->current_video_frame == 1) {
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+ ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+ if (!ppstate->prev_mip) {
+ return 1;
+ }
+ ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+ vpx_memset(ppstate->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ }
+
+ // Allocate post_proc_buffer_int if needed.
+ if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+ const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ VP9_ENC_BORDER_IN_PIXELS) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate MFQE framebuffer");
+ }
+
+ // Initialize the MFQE buffer (here to mid-gray, 128) so that post proc
+ // doesn't pull random data in from the edge.
+ vpx_memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+ cm->post_proc_buffer.frame_size);
+ }
+ }
+
#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -644,7 +694,27 @@ int vp9_post_proc_frame(struct VP9Common *cm,
"Failed to allocate post-processing buffer");
#endif
- if (flags & VP9D_DEMACROBLOCK) {
+ if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+ cm->postproc_state.last_frame_valid &&
+ cm->postproc_state.last_base_qindex <= last_q_thresh &&
+ cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+ vp9_mfqe(cm);
+ // TODO(jackychen): Consider whether to enable deblocking by default
+ // when MFQE is enabled; both quality and speed need to be taken
+ // into consideration.
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ }
+ if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+ deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+ q + (ppflags->deblocking_level - 5) * 10,
+ 1, 0);
+ } else if (flags & VP9D_DEBLOCK) {
+ vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+ } else {
+ vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ }
+ } else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
q + (ppflags->deblocking_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
@@ -653,6 +723,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+
if (flags & VP9D_ADDNOISE) {
const int noise_level = ppflags->noise_level;
if (ppstate->last_q != q ||
@@ -673,6 +746,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
dest->uv_width = dest->y_width >> cm->subsampling_x;
dest->uv_height = dest->y_height >> cm->subsampling_y;
+ swap_mi_and_prev_mi(cm);
return 0;
}
#endif
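
The MFQE gate above only fires on a quality drop: the previous frame must have been reasonably good (last_base_qindex <= last_q_thresh = 170) and the current frame must be noticeably worse (base_qindex at least q_diff_thresh = 20 higher). For example, with last_base_qindex = 60 and base_qindex = 90 the difference is 30 >= 20, so vp9_mfqe() runs; with base_qindex = 70 the difference is only 10 and the frame falls through to the ordinary deblock/demacroblock path.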
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index ebebc1ae3..035c9cdf8 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -14,6 +14,8 @@
#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
#include "vp9/common/vp9_ppflags.h"
#ifdef __cplusplus
@@ -24,6 +26,10 @@ struct postproc_state {
int last_q;
int last_noise;
char noise[3072];
+ int last_base_qindex;
+ int last_frame_valid;
+ MODE_INFO *prev_mip;
+ MODE_INFO *prev_mi;
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -31,6 +37,8 @@ struct postproc_state {
struct VP9Common;
+#define MFQE_PRECISION 4
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h
index 1644a1bbb..12b989f43 100644
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -26,7 +26,8 @@ enum {
VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
VP9D_DEBUG_DRAW_MV = 1 << 7,
VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+ VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+ VP9D_MFQE = 1 << 10
};
typedef struct {
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 901a043f6..fd735f483 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -15,21 +15,17 @@
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_seg_common.h"
-static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) {
- return (mi != NULL) ? &mi->mbmi : NULL;
-}
-
// Returns a context number for the given MB prediction signal
int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ?
- left_mbmi->interp_filter : SWITCHABLE_FILTERS;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ?
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int left_type = xd->left_available && is_inter_block(left_mbmi) ?
+ left_mbmi->interp_filter : SWITCHABLE_FILTERS;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const int above_type = xd->up_available && is_inter_block(above_mbmi) ?
above_mbmi->interp_filter : SWITCHABLE_FILTERS;
if (left_type == above_type)
@@ -50,10 +46,10 @@ int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// 2 - intra/--, --/intra
// 3 - intra/intra
int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
@@ -70,10 +66,10 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
int vp9_get_reference_mode_context(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int ctx;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -113,10 +109,10 @@ int vp9_get_reference_mode_context(const VP9_COMMON *cm,
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int above_in_image = above_mbmi != NULL;
- const int left_in_image = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
@@ -194,10 +190,10 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -260,10 +256,10 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
@@ -349,10 +345,10 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// The prediction flags in these dummy entries are initialized to 0.
int vp9_get_tx_size_context(const MACROBLOCKD *xd) {
const int max_tx_size = max_txsize_lookup[xd->mi[0].src_mi->mbmi.sb_type];
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
: max_tx_size;
int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index cf13e4a91..bc19d28b9 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -18,20 +18,12 @@
extern "C" {
#endif
-static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
- return xd->up_available ? xd->mi[-xd->mi_stride].src_mi : NULL;
-}
-
-static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) {
- return xd->left_available ? xd->mi[-1].src_mi : NULL;
-}
-
int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col);
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const int above_sip = (above_mi != NULL) ?
above_mi->mbmi.seg_id_predicted : 0;
const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
@@ -45,8 +37,8 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
}
static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
return above_skip + left_skip;
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1872191ff..575990bb5 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -236,36 +236,29 @@ specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/;
$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
+specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/;
add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
+specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/;
add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/;
-$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon;
+specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon;
+specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
+specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
+specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/;
-$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon;
+specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
@@ -296,36 +289,28 @@ $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
# Sub Pixel Filters
#
add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon;
+specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
+specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_neon_asm=vp9_convolve8_neon;
+specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon;
+specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon;
+specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
+specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/;
#
# dct
@@ -437,48 +422,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_iwht4x4_16_add/;
} else {
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
- $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+ specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
- $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+ specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
- $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
- $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+ specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
- $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+ specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
- $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+ specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
- $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+ specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
- $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+ specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
- $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+ specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+ #is this a typo?
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
- $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+ specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 3610c7165..42e0baa05 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -4260,7 +4260,7 @@ void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
+ for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
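
The loop-bound change makes the overflow pre-check scan all eight coefficient vectors instead of stopping after the first four, so the choice between the fast 16-bit transform and the slower high-bit-depth fallback is based on the full input. A scalar sketch of that pre-check pattern (illustrative only; the real code compares SIMD rows, and the bound used here is an assumption):

#include <stdint.h>

/* Scan every row of 16-bit coefficients for the extreme values, then test
   the range before committing to the fast 16-bit transform path. */
static int rows_fit_fast_path(const int16_t rows[8][8], int16_t bound) {
  int16_t max_v = rows[0][0], min_v = rows[0][0];
  int i, j;
  for (i = 0; i < 8; ++i) {      /* all eight rows, matching the fix */
    for (j = 0; j < 8; ++j) {
      if (rows[i][j] > max_v) max_v = rows[i][j];
      if (rows[i][j] < min_v) min_v = rows[i][j];
    }
  }
  return max_v <= bound && min_v >= -bound;
}
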
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index fd781d4bc..4a5bf1b60 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -18,7 +18,7 @@
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
+ movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
@@ -661,7 +661,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
+ movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
@@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movq xmm0, [rsi - 3] ;load src data
movq xmm4, [rsi + 5]
- movq xmm7, [rsi + 13]
+ movq xmm6, [rsi + 13]
punpcklqdq xmm0, xmm4
- punpcklqdq xmm4, xmm7
+ punpcklqdq xmm4, xmm6
+
+ movdqa xmm7, xmm0
+ punpcklbw xmm7, xmm7
+ punpckhbw xmm0, xmm0
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
- movdqa xmm5, xmm4
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
-
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pshufb xmm3, [GLOBAL(shuf_t6t7)]
- pshufb xmm4, [GLOBAL(shuf_t0t1)]
- pshufb xmm5, [GLOBAL(shuf_t2t3)]
- pshufb xmm6, [GLOBAL(shuf_t4t5)]
- pshufb xmm7, [GLOBAL(shuf_t6t7)]
+ palignr xmm0, xmm7, 1
+ palignr xmm1, xmm7, 5
pmaddubsw xmm0, k0k1
+ palignr xmm2, xmm7, 9
pmaddubsw xmm1, k2k3
+ palignr xmm3, xmm7, 13
+
pmaddubsw xmm2, k4k5
pmaddubsw xmm3, k6k7
- pmaddubsw xmm4, k0k1
- pmaddubsw xmm5, k2k3
- pmaddubsw xmm6, k4k5
- pmaddubsw xmm7, k6k7
-
paddsw xmm0, xmm3
+
+ movdqa xmm3, xmm4
+ punpcklbw xmm3, xmm3
+ punpckhbw xmm4, xmm4
+
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+
+ palignr xmm4, xmm3, 1
+ palignr xmm5, xmm3, 5
+ palignr xmm6, xmm3, 9
+ palignr xmm7, xmm3, 13
+
movdqa xmm3, xmm1
+ pmaddubsw xmm4, k0k1
pmaxsw xmm1, xmm2
+ pmaddubsw xmm5, k2k3
pminsw xmm2, xmm3
+ pmaddubsw xmm6, k4k5
paddsw xmm0, xmm2
+ pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
paddsw xmm4, xmm7
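
For reference, the block above is a horizontal 8-tap filter pass: each output pixel is a weighted sum of eight source bytes (the window starts at rsi - 3), with the 0x0040 halves of the constant loaded into rcx providing the rounding term before the shift. A scalar sketch under those assumptions (tap layout and clamping follow the usual VP9 convolve convention, not this exact routine):

#include <stdint.h>

/* Scalar reference for one 8-tap filtered pixel: src must have 3 readable
   bytes before it and 4 after; taps are the signed filter weights. */
static uint8_t filter8_px(const uint8_t *src, const int16_t *taps) {
  int i, sum = 64;                 /* rounding term, the 0x0040 lanes */
  for (i = 0; i < 8; ++i)
    sum += src[i - 3] * taps[i];   /* window starts 3 pixels back */
  sum >>= 7;                       /* FILTER_BITS; arithmetic shift assumed */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
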
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index a088325df..2c5fbacb9 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -747,10 +747,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width;
height = buf->y_crop_height;
- if (buf->corrupted) {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame reference is corrupt");
- }
found = 1;
break;
}
@@ -978,9 +974,12 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
&tile_data->bit_reader, BLOCK_64X64);
}
pbi->mb.corrupted |= tile_data->xd.corrupted;
+ if (pbi->mb.corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
}
// Loopfilter one row.
- if (cm->lf.filter_level && !pbi->mb.corrupted) {
+ if (cm->lf.filter_level) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
@@ -1003,7 +1002,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
}
// Loopfilter remaining rows in the frame.
- if (cm->lf.filter_level && !pbi->mb.corrupted) {
+ if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
winterface->sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
@@ -1564,6 +1563,9 @@ void vp9_decode_frame(VP9Decoder *pbi,
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+ if (new_fb->corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data header is corrupted.");
// TODO(jzern): remove frame_parallel_decoding_mode restriction for
// single-frame tile decoding.
@@ -1576,6 +1578,10 @@ void vp9_decode_frame(VP9Decoder *pbi,
vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
pbi->tile_workers, pbi->num_tile_workers,
cm->lf.filter_level, 0);
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
+
}
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
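
Taken together, these hunks stop flagging reference buffers as corrupt and instead raise the error as soon as corruption is detected; the vpx_internal_error() calls then unwind out of tile decoding through the library's setjmp/longjmp error convention. A self-contained sketch of that pattern (the names here are illustrative, not libvpx's):

#include <setjmp.h>
#include <stdio.h>

static jmp_buf decode_error_jmp;

static void internal_error(const char *msg) {
  fprintf(stderr, "decode error: %s\n", msg);
  longjmp(decode_error_jmp, 1);        /* jumps straight back to the caller */
}

static int decode_one_frame(int tile_is_corrupt) {
  if (setjmp(decode_error_jmp))
    return -1;                         /* frame abandoned on first error */
  if (tile_is_corrupt)
    internal_error("Failed to decode tile data");
  return 0;                            /* decoded cleanly */
}
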
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index ecab71a9b..cff94db2d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -440,9 +440,6 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
if ((!vp9_is_valid_scale(&ref_buf->sf)))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- if (ref_buf->buf->corrupted)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Block reference is corrupt");
vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
&ref_buf->sf);
vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 39f03aac1..2daf86200 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -288,16 +288,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
vp9_clear_system_state();
- // We do not know if the missing frame(s) was supposed to update
- // any of the reference buffers, but we act conservative and
- // mark only the last buffer as corrupted.
- //
- // TODO(jkoleszar): Error concealment is undefined and non-normative
- // at this point, but if it becomes so, [0] may not always be the correct
- // thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
- cm->frame_refs[0].buf->corrupted = 1;
-
if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
cm->frame_bufs[cm->new_fb_idx].ref_count--;
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 83f4a53d6..22e5217b6 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -16,19 +16,29 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/encoder/vp9_segmentation.h"
-#define AQ_C_SEGMENTS 3
-#define AQ_C_STRENGTHS 3
-static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
- {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+ { {1.75, 1.25, 1.05, 1.00, 0.90},
+ {2.00, 1.50, 1.15, 1.00, 0.85},
+ {2.50, 1.75, 1.25, 1.00, 0.80} };
static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
- {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
-static const double aq_c_var_thresholds[AQ_C_SEGMENTS] = {100.0, 12.0, 10.0};
+ { {0.15, 0.30, 0.55, 2.00, 100.0},
+ {0.20, 0.40, 0.65, 2.00, 100.0},
+ {0.25, 0.50, 0.75, 2.00, 100.0} };
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+ { {-4.0, -3.0, -2.0, 100.00, 100.0},
+ {-3.5, -2.5, -1.5, 100.00, 100.0},
+ {-3.0, -2.0, -1.0, 100.00, 100.0} };
+
+#define DEFAULT_COMPLEXITY 64
+
static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
  // Approximate base quantizer (truncated to int)
const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
- return (base_quant > 20) + (base_quant > 45);
+ return (base_quant > 10) + (base_quant > 25);
}
void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
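
With the five-segment tables above, get_aq_c_strength() picks which row to use; moving the breakpoints from 20/45 down to 10/25 lets the stronger rows apply at lower base Q. A small sketch of the mapping (the sample base_quant values are illustrative):

/* The strength index selects a row of aq_c_q_adj_factor, aq_c_transitions
   and aq_c_var_thresholds; base_quant stands in for
   vp9_ac_quant(q_index, 0, bit_depth) / 4. */
static int aq_c_strength_sketch(int base_quant) {
  return (base_quant > 10) + (base_quant > 25);  /* 0, 1 or 2 */
}
/* e.g. base_quant  8 -> row 0 (mildest factors)
        base_quant 18 -> row 1
        base_quant 40 -> row 2 (strongest factors) */
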
@@ -43,13 +53,10 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
int segment;
const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
- const int active_segments = aq_c_active_segments[aq_strength];
// Clear down the segment map.
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-
- // Clear down the complexity map used for rd.
- vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+ vpx_memset(cpi->segmentation_map, DEFAULT_AQ2_SEG,
+ cm->mi_rows * cm->mi_cols);
vp9_clearall_segfeatures(seg);
@@ -65,15 +72,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
// Select delta coding method.
seg->abs_delta = SEGMENT_DELTADATA;
- // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
- vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
// Use some of the segments for in frame Q adjustment.
- for (segment = 1; segment < active_segments; ++segment) {
- int qindex_delta =
- vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- aq_c_q_adj_factor[aq_strength][segment],
- cm->bit_depth);
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG)
+ continue;
+
+ qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment],
+ cm->bit_depth);
+
    // For AQ complexity mode, we don't allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -90,61 +103,54 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
}
}
-// Select a segment for the current SB64 block.
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+#define VAR_STRENGTH_STEP 0.25
+// Select a segment for the current block.
// The choice of segment for a block depends on the ratio of the projected
-// bits for the block vs a target average.
-// An "aq_strength" value determines how many segments are supported,
-// the set of transition points to use and the extent of the quantizer
-// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
-void vp9_select_in_frame_q_segment(VP9_COMP *cpi, MACROBLOCK *mb,
- BLOCK_SIZE bs,
- int mi_row, int mi_col,
- int output_enabled, int projected_rate) {
+// bits for the block vs a target average and its spatial complexity.
+void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
VP9_COMMON *const cm = &cpi->common;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
- const int xmis = MIN(cm->mi_cols - mi_col, bw);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
- int complexity_metric = 64;
+ const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+ const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
int x, y;
-
+ int i;
unsigned char segment;
- if (!output_enabled) {
- segment = 0;
+ if (0) {
+ segment = DEFAULT_AQ2_SEG;
} else {
// Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
// It is converted to bits * 256 units.
const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
(bw * bh);
- const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
- const int active_segments = aq_c_active_segments[aq_strength];
double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ vp9_clear_system_state();
+ low_var_thresh = (cpi->oxcf.pass == 2)
+ ? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
logvar = vp9_log_block_var(cpi, mb, bs);
- // The number of segments considered and the transition points used to
- // select them is determined by the "aq_strength" value.
- // Currently this loop only supports segments that reduce Q (i.e. where
- // there is undershoot.
- // The loop counts down towards segment 0 which is the default segment
- // with no Q adjustment.
- segment = active_segments - 1;
- while (segment > 0) {
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
if ((projected_rate <
- target_rate * aq_c_transitions[aq_strength][segment]) &&
- (logvar < aq_c_var_thresholds[segment])) {
+ target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
break;
}
- --segment;
- }
-
- if (target_rate > 0) {
- complexity_metric =
- clamp((int)((projected_rate * 64) / target_rate), 16, 255);
}
}
@@ -152,8 +158,6 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi, MACROBLOCK *mb,
for (y = 0; y < ymis; y++) {
for (x = 0; x < xmis; x++) {
cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
- cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
- (unsigned char)complexity_metric;
}
}
}
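
Putting the new tables and loop together: vp9_caq_select_segment() starts from the last (highest Q) segment and takes the first segment whose rate and variance tests both pass, with DEFAULT_AQ2_SEG (factor 1.00) acting as the neutral row. A self-contained sketch of that selection, reusing the tables from this change (low_var_thresh and the inputs are illustrative):

#define AQ_C_SEGMENTS 5
#define AQ_C_STRENGTHS 3

static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
  {0.15, 0.30, 0.55, 2.00, 100.0},
  {0.20, 0.40, 0.65, 2.00, 100.0},
  {0.25, 0.50, 0.75, 2.00, 100.0}
};
static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
  {-4.0, -3.0, -2.0, 100.0, 100.0},
  {-3.5, -2.5, -1.5, 100.0, 100.0},
  {-3.0, -2.0, -1.0, 100.0, 100.0}
};

/* First segment whose projected rate is under the transition fraction of
   the target and whose log-variance is under the offset threshold wins;
   otherwise the last, highest-Q segment is kept. */
static int select_caq_segment(int strength, int projected_rate,
                              int target_rate, double logvar,
                              double low_var_thresh) {
  int i;
  for (i = 0; i < AQ_C_SEGMENTS; ++i) {
    if (projected_rate < target_rate * aq_c_transitions[strength][i] &&
        logvar < low_var_thresh + aq_c_var_thresholds[strength][i])
      return i;
  }
  return AQ_C_SEGMENTS - 1;
}
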
diff --git a/vp9/encoder/vp9_aq_complexity.h b/vp9/encoder/vp9_aq_complexity.h
index 3f885e450..c0dce6c5b 100644
--- a/vp9/encoder/vp9_aq_complexity.h
+++ b/vp9/encoder/vp9_aq_complexity.h
@@ -19,11 +19,10 @@ extern "C" {
struct VP9_COMP;
struct macroblock;
-// Select a segment for the current SB64.
-void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, struct macroblock *x,
- BLOCK_SIZE bs,
- int mi_row, int mi_col,
- int output_enabled, int projected_rate);
+// Select a segment for the current Block.
+void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate);
// This function sets up a set of segments with delta Q values around
// the baseline frame quantizer.
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4d88fb5a5..20368f096 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -346,9 +346,8 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO *mi_8x8, vp9_writer *w) {
const struct segmentation *const seg = &cm->seg;
const MODE_INFO *const mi = mi_8x8;
- const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride].src_mi;
- const MODE_INFO *const left_mi =
- xd->left_available ? mi_8x8[-1].src_mi : NULL;
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5e6e77dc9..756393f31 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -189,10 +189,10 @@ static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
// Lighter version of set_offsets that only sets the mode info
// pointers.
-static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
- MACROBLOCKD *const xd,
- int mi_row,
- int mi_col) {
+static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int mi_row,
+ int mi_col) {
const int idx_str = xd->mi_stride * mi_row + mi_col;
xd->mi = cm->mi + idx_str;
xd->mi[0].src_mi = &xd->mi[0];
@@ -210,7 +210,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
set_skip_context(xd, mi_row, mi_col);
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
mbmi = &xd->mi[0].src_mi->mbmi;
@@ -270,16 +270,15 @@ static void set_block_size(VP9_COMP * const cpi,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
- set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col);
+ set_mode_info_offsets(&cpi->common, xd, mi_row, mi_col);
xd->mi[0].src_mi->mbmi.sb_type = bsize;
- duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize);
}
}
typedef struct {
int64_t sum_square_error;
int64_t sum_error;
- int count;
+ int log2_count;
int variance;
} var;
@@ -328,7 +327,6 @@ typedef enum {
static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
int i;
node->part_variances = NULL;
- vpx_memset(node->split, 0, sizeof(node->split));
switch (bsize) {
case BLOCK_64X64: {
v64x64 *vt = (v64x64 *) data;
@@ -376,18 +374,18 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
v->sum_square_error = s2;
v->sum_error = s;
- v->count = c;
- if (c > 0)
- v->variance = (int)(256 *
- (v->sum_square_error - v->sum_error * v->sum_error /
- v->count) / v->count);
- else
- v->variance = 0;
+ v->log2_count = c;
+}
+
+static void get_variance(var *v) {
+ v->variance = (int)(256 * (v->sum_square_error -
+ ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
}
void sum_2_variances(const var *a, const var *b, var *r) {
+ assert(a->log2_count == b->log2_count);
fill_variance(a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->count + b->count, r);
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
}
static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
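
Because every merge in the tree doubles the sample count, storing log2_count lets sum_2_variances() simply bump the exponent, and get_variance() can use shifts instead of divisions; the variance is only evaluated when a partition check actually needs it. A self-contained numeric sketch (the statistics below are made up):

#include <stdint.h>
#include <stdio.h>

typedef struct {
  int64_t sum_square_error;
  int64_t sum_error;
  int log2_count;                     /* sample count is 1 << log2_count */
  int variance;
} var;

static void get_variance(var *v) {
  v->variance = (int)((256 * (v->sum_square_error -
      ((v->sum_error * v->sum_error) >> v->log2_count))) >> v->log2_count);
}

static void sum_2_variances(const var *a, const var *b, var *r) {
  r->sum_square_error = a->sum_square_error + b->sum_square_error;
  r->sum_error = a->sum_error + b->sum_error;
  r->log2_count = a->log2_count + 1;  /* doubling the sample count */
}

int main(void) {
  /* two 8-sample halves (log2_count = 3), illustrative statistics */
  var a = { 640, 40, 3, 0 }, b = { 900, 72, 3, 0 }, whole;
  sum_2_variances(&a, &b, &whole);    /* 16 samples, log2_count = 4 */
  get_variance(&whole);
  printf("combined variance (x256): %d\n", whole.variance);
  return 0;
}
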
@@ -434,6 +432,7 @@ static int set_vt_partitioning(VP9_COMP *cpi,
// variance is below threshold, otherwise split will be selected.
// No check for vert/horiz split as too few samples for variance.
if (bsize == bsize_ref) {
+ get_variance(&vt.part_variances->none);
if (mi_col + block_width / 2 < cm->mi_cols &&
mi_row + block_height / 2 < cm->mi_rows &&
vt.part_variances->none.variance < threshold_bsize_ref) {
@@ -442,6 +441,7 @@ static int set_vt_partitioning(VP9_COMP *cpi,
}
return 0;
} else if (bsize > bsize_ref) {
+ get_variance(&vt.part_variances->none);
// For key frame, for bsize above 32X32, or very high variance, take split.
if (cm->frame_type == KEY_FRAME &&
(bsize > BLOCK_32X32 ||
@@ -455,24 +455,32 @@ static int set_vt_partitioning(VP9_COMP *cpi,
set_block_size(cpi, xd, mi_row, mi_col, bsize);
return 1;
}
+
// Check vertical split.
- if (mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->vert[0].variance < threshold_low &&
- vt.part_variances->vert[1].variance < threshold_low) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
- set_block_size(cpi, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
- return 1;
+ if (mi_row + block_height / 2 < cm->mi_rows) {
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold_low &&
+ vt.part_variances->vert[1].variance < threshold_low) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
}
// Check horizontal split.
- if (mi_col + block_width / 2 < cm->mi_cols &&
- vt.part_variances->horz[0].variance < threshold_low &&
- vt.part_variances->horz[1].variance < threshold_low) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
- set_block_size(cpi, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
- return 1;
+ if (mi_col + block_width / 2 < cm->mi_cols) {
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold_low &&
+ vt.part_variances->horz[1].variance < threshold_low) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
}
+
return 0;
}
return 0;
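
Restructured this way, the variances are computed lazily, only for the sub-blocks that are about to be tested. Ignoring the bsize_ref and frame-border conditions, the decision order reduces to the sketch below (thresholds and inputs are illustrative):

typedef enum { PICK_NONE, PICK_WHOLE, PICK_VERT, PICK_HORZ } vt_choice;

/* Whole block first, then the vertical halves, then the horizontal halves;
   if nothing is quiet enough the caller keeps splitting. */
static vt_choice pick_partition(int var_none, int var_vert0, int var_vert1,
                                int var_horz0, int var_horz1,
                                int threshold, int threshold_low) {
  if (var_none < threshold)
    return PICK_WHOLE;                         /* keep the block as one unit */
  if (var_vert0 < threshold_low && var_vert1 < threshold_low)
    return PICK_VERT;                          /* two side-by-side halves */
  if (var_horz0 < threshold_low && var_horz1 < threshold_low)
    return PICK_HORZ;                          /* two stacked halves */
  return PICK_NONE;                            /* caller recurses / splits */
}
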
@@ -574,7 +582,7 @@ static void choose_partitioning(VP9_COMP *cpi,
// If variance is based on 8x8 downsampling, we stop here and have
        // one sample for 8x8 block (so pass 0 as log2_count to fill_variance),
// which of course means variance = 0 for 8x8 block.
- fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
} else {
// For key frame, go down to 4x4.
v8x8 *vst2 = &vst->split[k];
@@ -592,7 +600,7 @@ static void choose_partitioning(VP9_COMP *cpi,
// If variance is based on 4x4 downsampling, we stop here and have
          // one sample for 4x4 block (so pass 0 as log2_count to fill_variance),
// which of course means variance = 0 for 4x4 block.
- fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none);
+ fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
}
}
}
@@ -700,7 +708,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
mi_addr->src_mi = mi_addr;
// If segmentation in use
- if (seg->enabled && output_enabled) {
+ if (seg->enabled) {
// For in frame complexity AQ copy the segment id from the segment map.
if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
@@ -863,6 +871,18 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
vp9_rd_cost_init(rd_cost);
}
+static int set_segment_rdmult(VP9_COMP *const cpi,
+ MACROBLOCK *const x,
+ int8_t segment_id) {
+ int segment_qindex;
+ VP9_COMMON *const cm = &cpi->common;
+ vp9_init_plane_quantizers(cpi, x);
+ vp9_clear_system_state();
+ segment_qindex = vp9_get_qindex(&cm->seg, segment_id,
+ cm->base_qindex);
+ return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
static void rd_pick_sb_modes(VP9_COMP *cpi,
TileDataEnc *tile_data,
MACROBLOCK *const x,
@@ -919,7 +939,6 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
if (aq_mode == VARIANCE_AQ) {
const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
: vp9_block_energy(cpi, x, bsize);
- int segment_qindex;
if (cm->frame_type == KEY_FRAME ||
cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
@@ -929,18 +948,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
}
- vp9_init_plane_quantizers(cpi, x);
- vp9_clear_system_state();
- segment_qindex = vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex);
- x->rdmult = vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
- unsigned char complexity = cpi->complexity_map[mi_offset];
- const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
- (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
- if (!is_edge && (complexity > 128))
- x->rdmult += ((x->rdmult * (complexity - 128)) / 256);
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == CYCLIC_REFRESH_AQ) {
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
@@ -967,6 +977,16 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
}
}
+
+ // Examine the resulting rate and for AQ mode 2 make a segment choice.
+ if ((rd_cost->rate != INT_MAX) &&
+ (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
x->rdmult = orig_rdmult;
// TODO(jingning) The rate-distortion optimization flow needs to be
@@ -1357,11 +1377,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
- MV_REF *const frame_mvs =
- cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
- int w, h;
- *(xd->mi[0].src_mi) = ctx->mic;
+ xd->mi[0] = ctx->mic;
xd->mi[0].src_mi = &xd->mi[0];
if (seg->enabled && cpi->oxcf.aq_mode) {
@@ -1382,21 +1399,26 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
if (is_inter_block(mbmi)) {
vp9_update_mv_count(td);
-
if (cm->interp_filter == SWITCHABLE) {
const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
}
}
- for (h = 0; h < y_mis; ++h) {
- MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
- for (w = 0; w < x_mis; ++w) {
- MV_REF *const mv = frame_mv + w;
- mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
- mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
- mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
- mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+ if (cm->use_prev_frame_mvs) {
+ MV_REF *const frame_mvs =
+ cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+ }
}
}
@@ -1761,14 +1783,6 @@ static void rd_use_partition(VP9_COMP *cpi,
if (do_recon) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against it's target
- // and and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
- vp9_select_in_frame_q_segment(cpi, x, bsize, mi_row, mi_col,
- output_enabled, chosen_rdc.rate);
- }
encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
pc_tree);
}
@@ -2500,13 +2514,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against it's target
- // and and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
- vp9_select_in_frame_q_segment(cpi, x, bsize, mi_row, mi_col,
- output_enabled, best_rdc.rate);
encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
bsize, pc_tree);
}
@@ -2719,27 +2726,27 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
switch (partition) {
case PARTITION_NONE:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
*(xd->mi[0].src_mi) = pc_tree->none.mic;
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
break;
case PARTITION_VERT:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
*(xd->mi[0].src_mi) = pc_tree->vertical[0].mic;
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
if (mi_col + hbs < cm->mi_cols) {
- set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col + hbs);
*(xd->mi[0].src_mi) = pc_tree->vertical[1].mic;
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize);
}
break;
case PARTITION_HORZ:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
*(xd->mi[0].src_mi) = pc_tree->horizontal[0].mic;
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
if (mi_row + hbs < cm->mi_rows) {
- set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row + hbs, mi_col);
*(xd->mi[0].src_mi) = pc_tree->horizontal[1].mic;
duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize);
}
@@ -2784,7 +2791,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
int do_recon, int64_t best_rd,
PC_TREE *pc_tree) {
const SPEED_FEATURES *const sf = &cpi->sf;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
@@ -3016,14 +3022,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against it's target
- // and and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
- vp9_select_in_frame_q_segment(cpi, x, bsize, mi_row, mi_col,
- output_enabled, best_rdc.rate);
- }
encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
bsize, pc_tree);
}
@@ -3114,7 +3112,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
if (mi_row + hbs < cm->mi_rows) {
pc_tree->horizontal[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
- &this_rdc, subsize, &pc_tree->horizontal[0]);
+ &this_rdc, subsize, &pc_tree->horizontal[1]);
pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
@@ -3173,7 +3171,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
TOKENEXTRA **tp,
int mi_row, int mi_col,
BLOCK_SIZE bsize, int output_enabled,
- RD_COST *rd_cost, PC_TREE *pc_tree) {
+ RD_COST *dummy_cost, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
TileInfo *tile_info = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
@@ -3182,9 +3180,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
const int mis = cm->mi_stride;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- RD_COST this_rdc;
- vp9_rd_cost_reset(&this_rdc);
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -3199,7 +3195,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
switch (partition) {
case PARTITION_NONE:
pc_tree->none.pred_pixel_ready = 1;
- nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->none);
pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
@@ -3209,7 +3205,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
break;
case PARTITION_VERT:
pc_tree->vertical[0].pred_pixel_ready = 1;
- nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->vertical[0]);
pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
@@ -3219,23 +3215,17 @@ static void nonrd_use_partition(VP9_COMP *cpi,
if (mi_col + hbs < cm->mi_cols) {
pc_tree->vertical[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
- &this_rdc, subsize, &pc_tree->vertical[1]);
+ dummy_cost, subsize, &pc_tree->vertical[1]);
pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[1].skip = x->skip;
encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs,
output_enabled, subsize, &pc_tree->vertical[1]);
-
- if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
- rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
- rd_cost->rate += this_rdc.rate;
- rd_cost->dist += this_rdc.dist;
- }
}
break;
case PARTITION_HORZ:
pc_tree->horizontal[0].pred_pixel_ready = 1;
- nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->horizontal[0]);
pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
@@ -3246,48 +3236,34 @@ static void nonrd_use_partition(VP9_COMP *cpi,
if (mi_row + hbs < cm->mi_rows) {
pc_tree->horizontal[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
- &this_rdc, subsize, &pc_tree->horizontal[0]);
+ dummy_cost, subsize, &pc_tree->horizontal[1]);
pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col,
output_enabled, subsize, &pc_tree->horizontal[1]);
-
- if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
- rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
- rd_cost->rate += this_rdc.rate;
- rd_cost->dist += this_rdc.dist;
- }
}
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
- nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
- subsize, output_enabled, rd_cost,
- pc_tree->split[0]);
- nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp,
- mi_row, mi_col + hbs, subsize, output_enabled,
- &this_rdc, pc_tree->split[1]);
- if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
- rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
- rd_cost->rate += this_rdc.rate;
- rd_cost->dist += this_rdc.dist;
- }
- nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
- mi_row + hbs, mi_col, subsize, output_enabled,
- &this_rdc, pc_tree->split[2]);
- if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
- rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
- rd_cost->rate += this_rdc.rate;
- rd_cost->dist += this_rdc.dist;
- }
- nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
- mi_row + hbs, mi_col + hbs, subsize, output_enabled,
- &this_rdc, pc_tree->split[3]);
- if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
- rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
- rd_cost->rate += this_rdc.rate;
- rd_cost->dist += this_rdc.dist;
+ if (bsize == BLOCK_8X8) {
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, pc_tree->leaf_split[0]);
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col,
+ output_enabled, subsize, pc_tree->leaf_split[0]);
+ } else {
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ subsize, output_enabled, dummy_cost,
+ pc_tree->split[0]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp,
+ mi_row, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[1]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ dummy_cost, pc_tree->split[2]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[3]);
}
break;
default:
@@ -3329,6 +3305,9 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
// Set the partition type of the 64X64 block
switch (sf->partition_search_type) {
case VAR_BASED_PARTITION:
+      // TODO(jingning) Only key frame coding supports sub8x8 blocks at this
+      // point. Sub8x8 block mode decision for P frames is still to be
+      // enabled.
choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index a03131ca6..aaa6b238d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -213,9 +213,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
cpi->coding_context.last_frame_seg_map_copy = NULL;
- vpx_free(cpi->complexity_map);
- cpi->complexity_map = NULL;
-
vpx_free(cpi->nmvcosts[0]);
vpx_free(cpi->nmvcosts[1]);
cpi->nmvcosts[0] = NULL;
@@ -1445,10 +1442,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
CHECK_MEM_ERROR(cm, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
- // Create a complexity map used for rd adjustment
- CHECK_MEM_ERROR(cm, cpi->complexity_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
// Create a map used for cyclic background refresh.
CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 7342f7496..14f7c7f0c 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -351,8 +351,6 @@ typedef struct VP9_COMP {
  // segment threshold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
- unsigned char *complexity_map;
-
CYCLIC_REFRESH *cyclic_refresh;
fractional_mv_step_fp *find_fractional_mv_step;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 1da5a83bd..b45032456 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -622,7 +622,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
sf, sf);
- if (!cm->error_resilient_mode)
+ if (cm->use_prev_frame_mvs)
vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame,
candidates, mi_row, mi_col);
else
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 37b6718bf..3cc9d9a7b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -426,8 +426,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
}
// Work out a size correction factor.
if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
- correction_factor = (100 * cpi->rc.projected_frame_size) /
- projected_size_based_on_q;
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
// More heavily damped adjustment used if we have been oscillating either side
// of target.
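
The cast widens the product to 64 bits before the divide; with plain int arithmetic, 100 * projected_frame_size overflows once the projected size exceeds about 21 million (INT_MAX / 100). A minimal illustration (the sizes are made up):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* a large frame, illustrative numbers */
  int projected_frame_size = 30 * 1000 * 1000;
  int projected_size_based_on_q = 25 * 1000 * 1000;

  /* 100 * 30,000,000 does not fit in 32 bits; widen first, as in the fix. */
  int correction_factor =
      (int)((100 * (int64_t)projected_frame_size) / projected_size_based_on_q);

  printf("correction_factor = %d\n", correction_factor);  /* prints 120 */
  return 0;
}
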
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c1bdff77a..600a3eb1a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -982,8 +982,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
int i, j;
const MACROBLOCKD *const xd = &mb->e_mbd;
MODE_INFO *const mic = xd->mi[0].src_mi;
- const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1058,8 +1058,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
TX_SIZE best_tx = TX_4X4;
int i;
int *bmode_costs;
- const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
bmode_costs = cpi->y_mode_costs[A][L];
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 508e1d4f5..f5f05e799 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -122,8 +122,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
- pmovmskb r6, m7
- pmovmskb r2, m12
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
or r6, r2
jz .skip_iter
%endif
@@ -308,8 +308,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%ifidn %1, fp_32x32
pcmpgtw m7, m6, m0
pcmpgtw m12, m11, m0
- pmovmskb r6, m7
- pmovmskb r2, m12
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
or r6, r2
jz .skip_iter
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm
index 1a9e4e8b6..06b8b034a 100644
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -101,7 +101,7 @@ SECTION .text
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
- movd rax, m6 ; store sum as return value
+ movd raxd, m6 ; store sum as return value
%else ; mmsize == 8
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
@@ -113,7 +113,7 @@ SECTION .text
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
paddd m6, m4
- movd rax, m6 ; store sum as return value
+ movd raxd, m6 ; store sum as return value
%endif
RET
%endmacro
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 9414120f6..2504f4db9 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -72,6 +72,8 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -131,28 +133,52 @@ ifeq ($(ARCH_X86_64), yes)
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+# NEON has both assembly and intrinsics implementations. If both are
+# available, prefer the assembly versions.
+ifeq ($(HAVE_NEON_ASM), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+else
+ifeq ($(HAVE_NEON), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))