66 files changed, 6010 insertions, 426 deletions
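Two of the configure.sh hunks below are worth calling out before the diff body. The armv7 branch now only soft-enables neon_asm when neon itself is enabled, and dies if neon is disabled while neon-asm is forced on. The x32 probe additionally requires __x86_64__, presumably because __ILP32__ alone no longer distinguishes x32 from plain 32-bit x86 on newer compilers; a proper x32 toolchain defines both macros. The tightened probe, taken verbatim from the hunk, compiles standalone:

    /* Passes the preprocessor only on x32 targets:
     * the 64-bit instruction set with 32-bit pointers. */
    #if !defined(__ILP32__) || !defined(__x86_64__)
    #error "not x32"
    #endif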
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 9327ce95e..7be583d72 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -805,7 +805,12 @@ process_common_toolchain() {
       ;;
     armv7|armv7s)
       soft_enable neon
-      soft_enable neon_asm
+      # Only enable neon_asm when neon is also enabled.
+      enabled neon && soft_enable neon_asm
+      # If someone tries to force it through, die.
+      if disabled neon && enabled neon_asm; then
+        die "Disabling neon while keeping neon-asm is not supported"
+      fi
       soft_enable media
       soft_enable fast_unaligned
       ;;
@@ -1118,7 +1123,7 @@ EOF
     bits=32
     enabled x86_64 && bits=64
     check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
+#if !defined(__ILP32__) || !defined(__x86_64__)
 #error "not x32"
 #endif
 EOF
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index b0f60529a..e30ccf92e 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -10,12 +10,14 @@
 #include <string.h>

 #include "test/acm_random.h"
+#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -31,13 +33,16 @@ typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              int w, int h);

 struct ConvolveFunctions {
-  ConvolveFunctions(ConvolveFunc h8, ConvolveFunc h8_avg,
+  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg,
+                    ConvolveFunc h8, ConvolveFunc h8_avg,
                     ConvolveFunc v8, ConvolveFunc v8_avg,
                     ConvolveFunc hv8, ConvolveFunc hv8_avg,
                     int bd)
-      : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),
-        hv8_avg_(hv8_avg), use_highbd_(bd) {}
+      : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
+        v8_avg_(v8_avg), hv8_avg_(hv8_avg), use_highbd_(bd) {}

+  ConvolveFunc copy_;
+  ConvolveFunc avg_;
   ConvolveFunc h8_;
   ConvolveFunc v8_;
   ConvolveFunc hv8_;
@@ -298,25 +303,35 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
         vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kOutputBufferSize));
+    output_ref_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
 #if CONFIG_VP9_HIGHBITDEPTH
     input16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment,
                      (kInputBufferSize + 1) * sizeof(uint16_t))) + 1;
     output16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    output16_ref_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
 #endif
   }

+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
   static void TearDownTestCase() {
     vpx_free(input_ - 1);
     input_ = NULL;
     vpx_free(output_);
     output_ = NULL;
+    vpx_free(output_ref_);
+    output_ref_ = NULL;
 #if CONFIG_VP9_HIGHBITDEPTH
     vpx_free(input16_ - 1);
     input16_ = NULL;
     vpx_free(output16_);
     output16_ = NULL;
+    vpx_free(output16_ref_);
+    output16_ref_ = NULL;
 #endif
   }
@@ -382,6 +397,13 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
   }

+  void CopyOutputToRef() {
+    vpx_memcpy(output_ref_, output_, kOutputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_memcpy(output16_ref_, output16_, kOutputBufferSize);
+#endif
+  }
+
   void CheckGuardBlocks() {
     for (int i = 0; i < kOutputBufferSize; ++i) {
       if (IsIndexInBorder(i))
@@ -415,6 +437,19 @@ class ConvolveTest : public
::testing::TestWithParam<ConvolveParam> { #endif } + uint8_t *output_ref() const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft(); + } else { + return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize + + BorderLeft()); + } +#else + return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft(); +#endif + } + uint16_t lookup(uint8_t *list, int index) const { #if CONFIG_VP9_HIGHBITDEPTH if (UUT_->use_highbd_ == 0) { @@ -493,24 +528,65 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { const ConvolveFunctions* UUT_; static uint8_t* input_; static uint8_t* output_; + static uint8_t* output_ref_; #if CONFIG_VP9_HIGHBITDEPTH static uint16_t* input16_; static uint16_t* output16_; + static uint16_t* output16_ref_; int mask_; #endif }; uint8_t* ConvolveTest::input_ = NULL; uint8_t* ConvolveTest::output_ = NULL; +uint8_t* ConvolveTest::output_ref_ = NULL; #if CONFIG_VP9_HIGHBITDEPTH uint16_t* ConvolveTest::input16_ = NULL; uint16_t* ConvolveTest::output16_ = NULL; +uint16_t* ConvolveTest::output16_ref_ = NULL; #endif TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); } +TEST_P(ConvolveTest, Copy) { + uint8_t* const in = input(); + uint8_t* const out = output(); + + ASM_REGISTER_STATE_CHECK( + UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) + << "(" << x << "," << y << ")"; +} + +TEST_P(ConvolveTest, Avg) { + uint8_t* const in = input(); + uint8_t* const out = output(); + uint8_t* const out_ref = output_ref(); + CopyOutputToRef(); + + ASM_REGISTER_STATE_CHECK( + UUT_->avg_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + ROUND_POWER_OF_TWO(lookup(in, y * kInputStride + x) + + lookup(out_ref, y * kOutputStride + x), 1)) + << "(" << x << "," << y << ")"; +} + TEST_P(ConvolveTest, CopyHoriz) { uint8_t* const in = input(); uint8_t* const out = output(); @@ -1188,6 +1264,30 @@ void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride, } #endif // HAVE_SSE2 && ARCH_X86_64 +void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, + filter_x, filter_x_stride, + filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, + filter_x, filter_x_stride, + filter_y, filter_y_stride, w, h, 8); +} + void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, @@ -1260,6 +1360,30 @@ void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, filter_y, filter_y_stride, w, h, 8); } +void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t 
*filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, + filter_x, filter_x_stride, + filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, + filter_x, filter_x_stride, + filter_y, filter_y_stride, w, h, 10); +} + void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, @@ -1332,6 +1456,30 @@ void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, filter_y, filter_y_stride, w, h, 10); } +void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, + filter_x, filter_x_stride, + filter_y, filter_y_stride, w, h, 12); +} + +void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, + filter_x, filter_x_stride, + filter_y, filter_y_stride, w, h, 12); +} + void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, @@ -1405,6 +1553,7 @@ void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, } const ConvolveFunctions convolve8_c( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); @@ -1423,6 +1572,7 @@ INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values( make_tuple(32, 64, &convolve8_c), make_tuple(64, 64, &convolve8_c))); const ConvolveFunctions convolve10_c( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10); @@ -1441,6 +1591,7 @@ INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values( make_tuple(32, 64, &convolve10_c), make_tuple(64, 64, &convolve10_c))); const ConvolveFunctions convolve12_c( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12); @@ -1462,6 +1613,7 @@ INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values( #else const ConvolveFunctions convolve8_c( + vp9_convolve_copy_c, vp9_convolve_avg_c, vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c, vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c, vp9_convolve8_c, vp9_convolve8_avg_c, 0); @@ -1485,14 +1637,17 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( #if HAVE_SSE2 && ARCH_X86_64 #if CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_sse2( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8, wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8, 
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8); const ConvolveFunctions convolve10_sse2( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10, wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10, wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10); const ConvolveFunctions convolve12_sse2( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12, wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12, wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12); @@ -1538,6 +1693,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( make_tuple(64, 64, &convolve12_sse2))); #else const ConvolveFunctions convolve8_sse2( + vp9_convolve_copy_sse2, vp9_convolve_avg_sse2, vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2, vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2, vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0); @@ -1561,6 +1717,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( #if HAVE_SSSE3 const ConvolveFunctions convolve8_ssse3( + vp9_convolve_copy_c, vp9_convolve_avg_c, vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3, vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3, vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0); @@ -1583,6 +1740,7 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( #if HAVE_AVX2 && HAVE_SSSE3 const ConvolveFunctions convolve8_avx2( + vp9_convolve_copy_c, vp9_convolve_avg_c, vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3, vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3, vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0); @@ -1603,11 +1761,20 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( make_tuple(64, 64, &convolve8_avx2))); #endif // HAVE_AVX2 && HAVE_SSSE3 +#if HAVE_NEON #if HAVE_NEON_ASM const ConvolveFunctions convolve8_neon( + vp9_convolve_copy_neon, vp9_convolve_avg_neon, + vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, + vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, + vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); +#else // HAVE_NEON +const ConvolveFunctions convolve8_neon( + vp9_convolve_copy_neon, vp9_convolve_avg_neon, vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); +#endif // HAVE_NEON_ASM INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_neon), @@ -1623,10 +1790,11 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( make_tuple(64, 32, &convolve8_neon), make_tuple(32, 64, &convolve8_neon), make_tuple(64, 64, &convolve8_neon))); -#endif +#endif // HAVE_NEON #if HAVE_DSPR2 const ConvolveFunctions convolve8_dspr2( + vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2, vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2, vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2, vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0); diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 5a7114022..0a0713ab5 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -8,13 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <string> #include "test/codec_factory.h" #include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" #include "test/ivf_video_source.h" #include "test/md5_helper.h" #include "test/util.h" #include "test/webm_video_source.h" #include "vpx_ports/vpx_timer.h" +#include "./ivfenc.h" #include "./vpx_version.h" using std::tr1::make_tuple; @@ -24,7 +28,9 @@ namespace { #define VIDEO_NAME 0 #define THREADS 1 +const int kMaxPsnr = 100; const double kUsecsInSec = 1000000.0; +static const char *kNewEncodeOutputFile = "new_encode.ivf"; /* DecodePerfTest takes a tuple of filename + number of threads to decode with @@ -105,4 +111,161 @@ TEST_P(DecodePerfTest, PerfTest) { INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest, ::testing::ValuesIn(kVP9DecodePerfVectors)); +class VP9NewEncodeDecodePerfTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { + protected: + VP9NewEncodeDecodePerfTest() + : EncoderTest(GET_PARAM(0)), + encoding_mode_(GET_PARAM(1)), + speed_(0), + outfile_(0), + out_frames_(0) { + } + + virtual ~VP9NewEncodeDecodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_resize_allowed = 0; + cfg_.rc_end_usage = VPX_VBR; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, speed_); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, 2); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); + const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; + outfile_ = fopen(path_to_source.c_str(), "wb"); + } + + virtual void EndPassHook() { + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); + fclose(outfile_); + outfile_ = NULL; + } + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) + ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); + + // Write frame header and data. 
+ ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } + + virtual bool DoDecode() { return 0; } + + void set_speed(unsigned int speed) { + speed_ = speed; + } + + private: + libvpx_test::TestMode encoding_mode_; + uint32_t speed_; + FILE *outfile_; + uint32_t out_frames_; +}; + + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), + width(width_), + height(height_), + bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { + SetUp(); + + // TODO(JBB): Make this work by going through the set of given files. + const int i = 0; + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + const char *video_name = kVP9EncodePerfTestVectors[i].name; + libvpx_test::I420VideoSource video( + video_name, + kVP9EncodePerfTestVectors[i].width, + kVP9EncodePerfTestVectors[i].height, + timebase.den, timebase.num, 0, + kVP9EncodePerfTestVectors[i].frames); + set_speed(2); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const uint32_t threads = 4; + + libvpx_test::IVFVideoSource decode_video(kNewEncodeOutputFile); + decode_video.Init(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + cfg.threads = threads; + libvpx_test::VP9Decoder decoder(cfg, 0); + + vpx_usec_timer t; + vpx_usec_timer_start(&t); + + for (decode_video.Begin(); decode_video.cxdata() != NULL; + decode_video.Next()) { + decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size()); + } + + vpx_usec_timer_mark(&t); + const double elapsed_secs = + double(vpx_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned decode_frames = decode_video.frame_number(); + const double fps = double(decode_frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", decode_frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +VP9_INSTANTIATE_TEST_CASE( + VP9NewEncodeDecodePerfTest, ::testing::Values(::libvpx_test::kTwoPassGood)); } // namespace diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 110c9c3c9..01abca5fa 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -62,6 +62,10 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs], using libvpx_test::ACMRandom; namespace { + +const int kSignBiasMaxDiff255 = 1500; +const int kSignBiasMaxDiff15 = 10000; + typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, @@ -160,7 +164,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); - const int max_diff = 1125; + const int max_diff = kSignBiasMaxDiff255; EXPECT_LT(diff, max_diff 
<< (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" @@ -190,9 +194,9 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); - const int max_diff = 10000; + const int max_diff = kSignBiasMaxDiff15; EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) - << "Error: 4x4 FDCT/FHT has a sign bias > " + << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j << " count0: " << count_sign_block[j][0] @@ -646,36 +650,24 @@ TEST_P(InvTrans8x8DCT, CompareReference) { using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -// TODO(jingning): re-enable after this handles the expanded range [0, 65535] -// returned from Rand16(). -INSTANTIATE_TEST_CASE_P( - DISABLED_C, FwdTrans8x8DCT, - ::testing::Values( - make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8))); INSTANTIATE_TEST_CASE_P( C, FwdTrans8x8DCT, ::testing::Values( + make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10), make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12))); #else -// TODO(jingning): re-enable after this handles the expanded range [0, 65535] -// returned from Rand16(). INSTANTIATE_TEST_CASE_P( - DISABLED_C, FwdTrans8x8DCT, + C, FwdTrans8x8DCT, ::testing::Values( make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH -// TODO(jingning): re-enable after this handles the expanded range [0, 65535] -// returned from Rand16(). -INSTANTIATE_TEST_CASE_P( - DISABLED_C, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8))); INSTANTIATE_TEST_CASE_P( C, FwdTrans8x8HT, ::testing::Values( + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10), make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10), make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10), @@ -691,12 +683,9 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): re-enable after this handles the expanded range [0, 65535] // returned from Rand16(). INSTANTIATE_TEST_CASE_P( - DISABLED_C, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( C, FwdTrans8x8HT, ::testing::Values( + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); @@ -706,12 +695,12 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): re-enable after this handles the expanded range [0, 65535] // returned from Rand16(). INSTANTIATE_TEST_CASE_P( - DISABLED_NEON, FwdTrans8x8DCT, + NEON, FwdTrans8x8DCT, ::testing::Values( make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0, VPX_BITS_8))); INSTANTIATE_TEST_CASE_P( - DISABLED_NEON, FwdTrans8x8HT, + NEON, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8), @@ -723,32 +712,24 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): re-enable after these handle the expanded range [0, 65535] // returned from Rand16(). 
INSTANTIATE_TEST_CASE_P( - DISABLED_SSE2, FwdTrans8x8DCT, + SSE2, FwdTrans8x8DCT, ::testing::Values( make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0, VPX_BITS_8))); INSTANTIATE_TEST_CASE_P( - DISABLED_SSE2, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( SSE2, FwdTrans8x8HT, ::testing::Values( + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8), make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -// TODO(jingning): re-enable after these handle the expanded range [0, 65535] -// returned from Rand16(). -INSTANTIATE_TEST_CASE_P( - DISABLED_SSE2, FwdTrans8x8DCT, - ::testing::Values( - make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8))); INSTANTIATE_TEST_CASE_P( SSE2, FwdTrans8x8DCT, ::testing::Values( + make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_64_add_10_sse2, 12, VPX_BITS_10), make_tuple(&vp9_highbd_fdct8x8_sse2, @@ -761,12 +742,9 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): re-enable after these handle the expanded range [0, 65535] // returned from Rand16(). INSTANTIATE_TEST_CASE_P( - DISABLED_SSE2, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( SSE2, FwdTrans8x8HT, ::testing::Values( + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); @@ -791,7 +769,7 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): re-enable after this handles the expanded range [0, 65535] // returned from Rand16(). 
INSTANTIATE_TEST_CASE_P( - DISABLED_SSSE3, FwdTrans8x8DCT, + SSSE3, FwdTrans8x8DCT, ::testing::Values( make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0, VPX_BITS_8))); diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 0f335e2e4..e1be80baa 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -594,4 +594,35 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif +#if HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test6Param, + ::testing::Values( +#if HAVE_NEON_ASM + make_tuple(&vp9_lpf_horizontal_16_neon, + &vp9_lpf_horizontal_16_c, 8), +#endif // HAVE_NEON_ASM + make_tuple(&vp9_lpf_horizontal_4_neon, + &vp9_lpf_horizontal_4_c, 8), + make_tuple(&vp9_lpf_horizontal_8_neon, + &vp9_lpf_horizontal_8_c, 8), + make_tuple(&vp9_lpf_vertical_4_neon, + &vp9_lpf_vertical_4_c, 8), + make_tuple(&vp9_lpf_vertical_8_neon, + &vp9_lpf_vertical_8_c, 8))); +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test9Param, + ::testing::Values( +#if HAVE_NEON_ASM + make_tuple(&vp9_lpf_horizontal_4_dual_neon, + &vp9_lpf_horizontal_4_dual_c, 8), +#endif // HAVE_NEON_ASM + make_tuple(&vp9_lpf_horizontal_8_dual_neon, + &vp9_lpf_horizontal_8_dual_c, 8), + make_tuple(&vp9_lpf_vertical_4_dual_neon, + &vp9_lpf_vertical_4_dual_c, 8), + make_tuple(&vp9_lpf_vertical_8_dual_neon, + &vp9_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH) + } // namespace diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 536273e3e..ba82da4e1 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -230,7 +230,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct4x4_1_add_c, TX_4X4, 1))); -#if HAVE_NEON_ASM +#if HAVE_NEON INSTANTIATE_TEST_CASE_P( NEON, PartialIDctTest, ::testing::Values( @@ -258,7 +258,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct4x4_16_add_c, &vp9_idct4x4_1_add_neon, TX_4X4, 1))); -#endif +#endif // HAVE_NEON #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( diff --git a/test/test-data.mk b/test/test-data.mk index 157d1bc34..d07ca3295 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -714,6 +714,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.iv LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) +# NewEncode Test +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += niklas_1280_720_30.yuv # BBB VP9 streams LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_426x240_tile_1x1_180kbps.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_640x360_tile_1x2_337kbps.webm @@ -743,7 +745,9 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv +ifneq ($(CONFIG_DECODE_PERF_TESTS),yes) LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv +endif LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv diff --git a/test/test.mk b/test/test.mk index 4b12a7693..d94329924 100644 --- a/test/test.mk +++ b/test/test.mk @@ -46,6 +46,9 @@ LIBVPX_TEST_SRCS-yes += decode_test_driver.h LIBVPX_TEST_SRCS-yes += encode_test_driver.cc LIBVPX_TEST_SRCS-yes += encode_test_driver.h +## IVF writing. 
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../ivfenc.c ../ivfenc.h
+
 ## Y4m parsing.
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_test.cc ../y4menc.c ../y4menc.h
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index 99453a998..8e75a4b19 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -617,9 +617,17 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %elifidn __OUTPUT_FORMAT__,elf64
         global %1:function hidden
     %elifidn __OUTPUT_FORMAT__,macho32
-        global %1:private_extern
+        %ifdef __NASM_VER__
+            global %1
+        %else
+            global %1:private_extern
+        %endif
     %elifidn __OUTPUT_FORMAT__,macho64
-        global %1:private_extern
+        %ifdef __NASM_VER__
+            global %1
+        %else
+            global %1:private_extern
+        %endif
     %else
         global %1
     %endif
diff --git a/vp9/common/arm/neon/vp9_avg_neon.c b/vp9/common/arm/neon/vp9_avg_neon.c
new file mode 100644
index 000000000..3a3db353e
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_avg_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8_t *d;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  uint32x2_t d0u32, d2u32;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_y;  (void)filter_y_stride;
+
+  d = dst;
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+                       vreinterpret_u8_u32(d2u32));
+
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/vp9/common/arm/neon/vp9_avg_neon.asm b/vp9/common/arm/neon/vp9_avg_neon_asm.asm
index 7d2453021..7d2453021 100644
--- a/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_avg_neon_asm.asm
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
new file mode 100644
index 000000000..2f8dda07c
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
@@ -0,0 +1,387 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <stddef.h> +#include <arm_neon.h> + +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +static inline int32x4_t MULTIPLY_BY_Q0( + int16x4_t dsrc0, + int16x4_t dsrc1, + int16x4_t dsrc2, + int16x4_t dsrc3, + int16x4_t dsrc4, + int16x4_t dsrc5, + int16x4_t dsrc6, + int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void vp9_convolve8_avg_horiz_neon( + uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, + int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, + int h) { + int width; + uint8_t *s, *d; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; + uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; + uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + if (x_step_q4 != 16) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4) { // loop_horiz_v + s = src; + d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), + vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + src += 7; + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 = vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + 
d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w; + width > 0; + width -= 4, src += 4, dst += 4) { // loop_horiz + s = src; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(src + 64); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), + vreinterpret_u16_u32(d31u32)); + d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), + vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(src + 64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), + vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(src + 64 + src_stride * 2); + + d = dst; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, + d18s16, d19s16, d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, + d19s16, d23s16, d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + __builtin_prefetch(src + 64 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), + vreinterpret_u16_u8(d3u8)); + d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), + vreinterpret_u32_u16(d0x2u16.val[1])); + d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), + vreinterpret_u8_u32(d0x2u32.val[1])); + + q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); + + q1u8 = vrhaddq_u8(q1u8, q3u8); + + d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); + d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); + + d = dst; + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + 
vst1_lane_u32((uint32_t *)d, d3u32, 1); + + q8u16 = q9u16; + d20s16 = d23s16; + q11u16 = q12u16; + q9u16 = q13u16; + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + } + src += src_stride * 4 - w - 7; + dst += dst_stride * 4 - w; + } + return; +} + +void vp9_convolve8_avg_vert_neon( + uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, + int y_step_q4, + int w, + int h) { + int height; + uint8_t *s, *d; + uint8x8_t d2u8, d3u8; + uint32x2_t d2u32, d3u32, d6u32, d7u32; + uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; + uint8x16_t q1u8, q3u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + + if (y_step_q4 != 16) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + src -= src_stride * 3; + q0s16 = vld1q_s16(filter_y); + for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h + s = src; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); + s += src_stride; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); + s += src_stride; + d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); + s += src_stride; + + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); + q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d = dst; + for (height = h; height > 0; height -= 4) { // loop_vert + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s += src_stride; + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s += src_stride; + + q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); + q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + d -= dst_stride * 3; + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + __builtin_prefetch(s); + __builtin_prefetch(s + src_stride); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, + d20s16, d21s16, d22s16, d24s16, q0s16); + 
__builtin_prefetch(s + src_stride * 2); + __builtin_prefetch(s + src_stride * 3); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, + d21s16, d22s16, d24s16, d26s16, q0s16); + __builtin_prefetch(d); + __builtin_prefetch(d + dst_stride); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, d26s16, d27s16, q0s16); + __builtin_prefetch(d + dst_stride * 2); + __builtin_prefetch(d + dst_stride * 3); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + q1u8 = vcombine_u8(d2u8, d3u8); + q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); + + q1u8 = vrhaddq_u8(q1u8, q3u8); + + d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); + d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); + + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + d += dst_stride; + + q8u16 = q10u16; + d18s16 = d22s16; + d19s16 = d24s16; + q10u16 = q13u16; + d22s16 = d25s16; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm index 4d85846f0..4d85846f0 100644 --- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm +++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c new file mode 100644 index 000000000..c8704aa9c --- /dev/null +++ b/vp9/common/arm/neon/vp9_convolve8_neon.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stddef.h> +#include <arm_neon.h> + +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +static inline int32x4_t MULTIPLY_BY_Q0( + int16x4_t dsrc0, + int16x4_t dsrc1, + int16x4_t dsrc2, + int16x4_t dsrc3, + int16x4_t dsrc4, + int16x4_t dsrc5, + int16x4_t dsrc6, + int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void vp9_convolve8_horiz_neon( + uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, + int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, + int h) { + int width; + uint8_t *s, *d, *psrc, *pdst; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; + uint8x16_t q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; + uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + if (x_step_q4 != 16) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4, + src += src_stride * 4, + dst += dst_stride * 4) { // loop_horiz_v + s = src; + d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), + vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + __builtin_prefetch(src + src_stride * 6); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 = vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + 
d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w, psrc = src + 7, pdst = dst; + width > 0; + width -= 4, psrc += 4, pdst += 4) { // loop_horiz + s = psrc; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(psrc + 64); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), + vreinterpret_u16_u32(d31u32)); + d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), + vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(psrc + 64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), + vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(psrc + 64 + src_stride * 2); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, + d18s16, d19s16, d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, + d19s16, d23s16, d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + __builtin_prefetch(psrc + 60 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), + vreinterpret_u16_u8(d3u8)); + d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), + vreinterpret_u32_u16(d0x2u16.val[1])); + d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), + vreinterpret_u8_u32(d0x2u32.val[1])); + + d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); + d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); + + d = pdst; + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + + q8u16 = q9u16; + d20s16 = d23s16; + q11u16 = q12u16; + q9u16 = q13u16; + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + } + } + return; +} + +void vp9_convolve8_vert_neon( + uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused 
+ const int16_t *filter_y, + int y_step_q4, + int w, + int h) { + int height; + uint8_t *s, *d; + uint32x2_t d2u32, d3u32; + uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + + if (y_step_q4 != 16) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + src -= src_stride * 3; + q0s16 = vld1q_s16(filter_y); + for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h + s = src; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); + s += src_stride; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); + s += src_stride; + d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); + s += src_stride; + + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); + q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d = dst; + for (height = h; height > 0; height -= 4) { // loop_vert + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s += src_stride; + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s += src_stride; + + q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); + q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + __builtin_prefetch(d); + __builtin_prefetch(d + dst_stride); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, + d20s16, d21s16, d22s16, d24s16, q0s16); + __builtin_prefetch(d + dst_stride * 2); + __builtin_prefetch(d + dst_stride * 3); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, + d21s16, d22s16, d24s16, d26s16, q0s16); + __builtin_prefetch(s); + __builtin_prefetch(s + src_stride); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, d26s16, d27s16, q0s16); + __builtin_prefetch(s + src_stride * 2); + __builtin_prefetch(s + src_stride * 3); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); + 
d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); + + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + d += dst_stride; + + q8u16 = q10u16; + d18s16 = d22s16; + d19s16 = d24s16; + q10u16 = q13u16; + d22s16 = d25s16; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.asm b/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm index 184c3ad67..184c3ad67 100644 --- a/vp9/common/arm/neon/vp9_convolve8_neon.asm +++ b/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_copy_neon.c b/vp9/common/arm/neon/vp9_copy_neon.c new file mode 100644 index 000000000..f334abe11 --- /dev/null +++ b/vp9/common/arm/neon/vp9_copy_neon.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stddef.h> +#include <arm_neon.h> + +void vp9_convolve_copy_neon( + const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, + int h) { + uint8x8_t d0u8, d2u8; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + + if (w > 32) { // copy64 + for (; h > 0; h--) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + vst1q_u8(dst + 32, q2u8); + vst1q_u8(dst + 48, q3u8); + dst += dst_stride; + } + } else if (w == 32) { // copy32 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + src += src_stride; + q2u8 = vld1q_u8(src); + q3u8 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + dst += dst_stride; + vst1q_u8(dst, q2u8); + vst1q_u8(dst + 16, q3u8); + dst += dst_stride; + } + } else if (w > 8) { // copy16 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, q0u8); + dst += dst_stride; + vst1q_u8(dst, q1u8); + dst += dst_stride; + } + } else if (w == 8) { // copy8 + for (; h > 0; h -= 2) { + d0u8 = vld1_u8(src); + src += src_stride; + d2u8 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, d0u8); + dst += dst_stride; + vst1_u8(dst, d2u8); + dst += dst_stride; + } + } else { // copy4 + for (; h > 0; h--) { + *(uint32_t *)dst = *(const uint32_t *)src; + src += src_stride; + dst += dst_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_copy_neon.asm b/vp9/common/arm/neon/vp9_copy_neon_asm.asm index a0bd04a35..a0bd04a35 100644 --- a/vp9/common/arm/neon/vp9_copy_neon.asm +++ b/vp9/common/arm/neon/vp9_copy_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c new file mode 100644 index 000000000..3c8c6a934 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vp9/common/vp9_idct.h" + +void vp9_idct16x16_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, j, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + for (d1 = d2 = dest, i = 0; i < 4; i++) { + for (j = 0; j < 2; j++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm index b1fd21bb6..b1fd21bb6 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c new file mode 100644 index 000000000..68d7cccc0 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c @@ -0,0 +1,1330 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static int16_t cospi_2_64 = 16305; +static int16_t cospi_4_64 = 16069; +static int16_t cospi_6_64 = 15679; +static int16_t cospi_8_64 = 15137; +static int16_t cospi_10_64 = 14449; +static int16_t cospi_12_64 = 13623; +static int16_t cospi_14_64 = 12665; +static int16_t cospi_16_64 = 11585; +static int16_t cospi_18_64 = 10394; +static int16_t cospi_20_64 = 9102; +static int16_t cospi_22_64 = 7723; +static int16_t cospi_24_64 = 6270; +static int16_t cospi_26_64 = 4756; +static int16_t cospi_28_64 = 3196; +static int16_t cospi_30_64 = 1606; + +static inline void TRANSPOSE8X8( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), + vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), + vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), + vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), + vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +void vp9_idct16x16_256_add_neon_pass1( + int16_t *in, + int16_t *out, + int output_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, 
d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16(cospi_24_64); + d31s16 = vdup_n_s16(cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = 
vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += 
output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vp9_idct16x16_256_add_neon_pass2( + int16_t *src, + int16_t *out, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16(cospi_30_64); + d13s16 = vdup_n_s16(cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16(cospi_14_64); + d31s16 = vdup_n_s16(cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(cospi_22_64); + d31s16 = vdup_n_s16(cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + 
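// The stage-3 pairs here all use the same idiom: vmull_s16 widens to 32 bits, vmlsl/vmlal accumulate x*c0 -/+ y*c1, and vqrshrn_n_s32(..., 14) rounds the Q14 fixed-point products back to int16. +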
q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + d30s16 = vdup_n_s16(cospi_6_64); + d31s16 = vdup_n_s16(cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + 
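// Stage 6 forms (a +/- b) * cospi_16_64: the q registers are split into d halves so vmull_s16 can build the 32-bit products that are added or subtracted before rounding. +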
d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, 
vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); + out += 4; + vst1_u64((uint64_t 
*)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +void vp9_idct16x16_10_add_neon_pass1( + int16_t *in, + int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16(cospi_16_64 * 2); + d4s16 = vdup_n_s16(cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = 
vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t *)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vp9_idct16x16_10_add_neon_pass2( + int16_t *src, + int16_t *out, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + 
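// Only low-frequency coefficients are nonzero on the 10-coefficient path, so the stage-3 butterflies above collapsed to single vqrdmulhq_s16 multiplies by doubled constants; stage 4 below still performs the full cospi_8_64/cospi_24_64 rotation. +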
d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = 
vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; + vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t 
*)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; +} diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm index a13c0d04b..a13c0d04b 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c index 0b9fc09ab..f2c4ec451 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -30,18 +30,24 @@ void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, uint8_t *dest, int dest_stride); +#if HAVE_NEON_ASM /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ extern void vp9_push_neon(int64_t *store); extern void vp9_pop_neon(int64_t *store); +#endif // HAVE_NEON_ASM void vp9_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, int dest_stride) { +#if HAVE_NEON_ASM int64_t store_reg[8]; +#endif int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; +#if HAVE_NEON_ASM // save d8-d15 register values. vp9_push_neon(store_reg); +#endif /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -103,20 +109,26 @@ void vp9_idct16x16_256_add_neon(const int16_t *input, dest+8, dest_stride); +#if HAVE_NEON_ASM // restore d8-d15 register values. vp9_pop_neon(store_reg); +#endif return; } void vp9_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, int dest_stride) { +#if HAVE_NEON_ASM int64_t store_reg[8]; +#endif int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; +#if HAVE_NEON_ASM // save d8-d15 register values. vp9_push_neon(store_reg); +#endif /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -165,8 +177,10 @@ void vp9_idct16x16_10_add_neon(const int16_t *input, dest+8, dest_stride); +#if HAVE_NEON_ASM // restore d8-d15 register values. vp9_pop_neon(store_reg); +#endif return; } diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c new file mode 100644 index 000000000..1bfee22b2 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "vp9/common/vp9_idct.h" + +static inline void LD_16x8( + uint8_t *d, + int d_stride, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; +} + +static inline void ADD_DIFF_16x8( + uint8x16_t qdiffu8, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; +} + +static inline void SUB_DIFF_16x8( + uint8x16_t qdiffu8, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, qdiffu8); + return; +} + +static inline void ST_16x8( + uint8_t *d, + int d_stride, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d += d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; +} + +void vp9_idct32x32_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + d += dest_stride8; + } + } + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm index d290d0753..d290d0753 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c new file mode 100644 index 000000000..53f721b44 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static int16_t cospi_1_64 = 16364; +static int16_t cospi_2_64 = 16305; +static int16_t cospi_3_64 = 16207; +static int16_t cospi_4_64 = 16069; +static int16_t cospi_5_64 = 15893; +static int16_t cospi_6_64 = 15679; +static int16_t cospi_7_64 = 15426; +static int16_t cospi_8_64 = 15137; +static int16_t cospi_9_64 = 14811; +static int16_t cospi_10_64 = 14449; +static int16_t cospi_11_64 = 14053; +static int16_t cospi_12_64 = 13623; +static int16_t cospi_13_64 = 13160; +static int16_t cospi_14_64 = 12665; +static int16_t cospi_15_64 = 12140; +static int16_t cospi_16_64 = 11585; +static int16_t cospi_17_64 = 11003; +static int16_t cospi_18_64 = 10394; +static int16_t cospi_19_64 = 9760; +static int16_t cospi_20_64 = 9102; +static int16_t cospi_21_64 = 8423; +static int16_t cospi_22_64 = 7723; +static int16_t cospi_23_64 = 7005; +static int16_t cospi_24_64 = 6270; +static int16_t cospi_25_64 = 5520; +static int16_t cospi_26_64 = 4756; +static int16_t cospi_27_64 = 3981; +static int16_t cospi_28_64 = 3196; +static int16_t cospi_29_64 = 2404; +static int16_t cospi_30_64 = 1606; +static int16_t cospi_31_64 = 804; + +#define LOAD_FROM_TRANSPOSED(prev, first, second) \ + q14s16 = vld1q_s16(trans_buf + first * 8); \ + q13s16 = vld1q_s16(trans_buf + second * 8); + +#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ + qA = vld1q_s16(out + first * 32); \ + qB = vld1q_s16(out + second * 32); + +#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ + vst1q_s16(out + first * 32, qA); \ + vst1q_s16(out + second * 32, qB); + +#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ + __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ + q6s16, q7s16, q8s16, q9s16); +static inline void __STORE_COMBINE_CENTER_RESULTS( + uint8_t *p1, + uint8_t *p2, + int stride, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16) { + int16x4_t d8s16, d9s16, d10s16, d11s16; + + d8s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d11s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d9s16 = vld1_s16((int16_t *)p1); + d10s16 = vld1_s16((int16_t *)p2); + + q7s16 = vrshrq_n_s16(q7s16, 6); + q8s16 = vrshrq_n_s16(q8s16, 6); + q9s16 = 
vrshrq_n_s16(q9s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + + q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), + vreinterpret_u8_s16(d9s16))); + q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_s16(d10s16))); + q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_s16(d11s16))); + q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), + vreinterpret_u8_s16(d8s16))); + + d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); + d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); + d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + + vst1_s16((int16_t *)p1, d9s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d10s16); + p2 += stride; + vst1_s16((int16_t *)p1, d8s16); + vst1_s16((int16_t *)p2, d11s16); + return; +} + +#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ + q4s16, q5s16, q6s16, q7s16); +static inline void __STORE_COMBINE_EXTREME_RESULTS( + uint8_t *p1, + uint8_t *p2, + int stride, + int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16 = vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), + vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), + vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), + vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), + vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; +} + +#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static inline void DO_BUTTERFLY( + int16x8_t q14s16, + int16x8_t q13s16, + int16_t first_const, + int16_t second_const, + int16x8_t *qAs16, + int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), + vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), + vqrshrn_n_s32(q10s32, 14)); 
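+  /* For reference, the sequence above computes, per 16-bit lane, the plane
+   * rotation used by every stage of this transform (a scalar sketch;
+   * round_shift14() is an illustrative name, not a function in this file):
+   *
+   *   // (x + (1 << 13)) >> 14, i.e. vqrshrn_n_s32(x, 14) minus the saturation
+   *   int16_t round_shift14(int32_t x);
+   *
+   *   out_a = round_shift14(a * first_const - b * second_const);
+   *   out_b = round_shift14(a * second_const + b * first_const);
+   *
+   * where a is taken from q14s16 and b from q13s16.
+   */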
+ return; +} + +static inline void idct32_transpose_pair( + int16_t *input, + int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), + vreinterpretq_s32_s16(q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), + vreinterpretq_s32_s16(q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), + vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; +} + +static inline void idct32_bands_end_1st_pass( + int16_t *out, + int16x8_t q2s16, + int16x8_t q3s16, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16, + int16x8_t q10s16, + int16x8_t q11s16, + int16x8_t q12s16, + int16x8_t q13s16, + int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, 
q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; +} + +static inline void idct32_bands_end_2nd_pass( + int16_t *out, + uint8_t *dest, + int stride, + int16x8_t q2s16, + int16x8_t q3s16, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16, + int16x8_t q10s16, + int16x8_t q11s16, + int16x8_t q12s16, + int16x8_t q13s16, + int16x8_t q14s16, + int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest/* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + 
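+  /* Each LOAD/add/sub group in these bands_end functions is the closing
+   * butterfly of the 32-point transform: it forms sum = e + o and
+   * diff = e - o for a pair of intermediate rows. The first pass writes
+   * the sums and differences back to the scratch buffer at mirrored row
+   * offsets (k and 31 - k); this second pass instead feeds them to the
+   * STORE_COMBINE_* helpers, which round by 6, add the mirrored
+   * destination rows and saturate to 8 bits. */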
q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; +} + +void vp9_idct32x32_1024_add_neon( + int16_t *input, + uint8_t *dest, + int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; + idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; + i < 4; i++, + input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = 
vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 = vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + 
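+      /* With both constants equal to cospi_16_64 (11585 ~ 2^14 / sqrt(2)),
+       * DO_BUTTERFLY_STD degenerates into the 45-degree case:
+       *   out_a ~ (a - b) / sqrt(2)
+       *   out_b ~ (a + b) / sqrt(2)
+       * which is all these stage-7 coefficient pairs need. */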
STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, + &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, 
q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, + q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, + q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); + dest += 8; + } + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm index 72e933eee..72e933eee 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c new file mode 100644 index 000000000..7c8a930b6 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vp9/common/vp9_idct.h" + +void vp9_idct4x4_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d6u8; + uint32x2_t d2u32 = vdup_n_u32(0); + uint16x8_t q8u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + q0s16 = vdupq_n_s16(a1); + + // dc_only_idct_add + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); + d1 += dest_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), + vreinterpret_u8_u32(d2u32)); + d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); + d2 += dest_stride; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); + d2 += dest_stride; + } + return; +} diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm index 0d4a721c4..0d4a721c4 100644 --- a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct4x4_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_add_neon.c new file mode 100644 index 000000000..dc91e0f30 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct4x4_add_neon.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void vp9_idct4x4_16_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d26u8, d27u8; + uint32x2_t d26u32, d27u32; + uint16x8_t q8u16, q9u16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; + int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; + int16x8_t q8s16, q9s16, q13s16, q14s16; + int32x4_t q1s32, q13s32, q14s32, q15s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + uint8_t *d; + int16_t cospi_8_64 = 15137; + int16_t cospi_16_64 = 11585; + int16_t cospi_24_64 = 6270; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + d20s16 = vdup_n_s16(cospi_8_64); + d21s16 = vdup_n_s16(cospi_16_64); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + d22s16 = vdup_n_s16(cospi_24_64); + + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_high_s16(q9s16); // vswp d18 d19 + d19s16 = vget_low_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + // do the transform on columns + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + 
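+  /* vrshrq_n_s16(x, 4) is the final rounding of the 4x4 inverse transform,
+   * i.e. (x + 8) >> 4, the NEON counterpart of ROUND_POWER_OF_TWO(x, 4)
+   * used by the DC-only path above. The block below then reconstructs,
+   * per pixel (scalar sketch, clip_to_u8 is an illustrative name):
+   *   dest[i] = clip_to_u8(dest[i] + res[i]);
+   * vaddw_u8 widens the predictor to 16 bits for the add, and
+   * vqmovun_s16 saturates the result back to [0, 255]. */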
+ d = dest; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); + d += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + d = dest; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + return; +} diff --git a/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm b/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm index 00283fc8d..00283fc8d 100644 --- a/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c new file mode 100644 index 000000000..24c29fb77 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "vp9/common/vp9_idct.h" + +void vp9_idct8x8_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d5u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + return; +} diff --git a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm index 421d202d4..421d202d4 100644 --- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c new file mode 100644 index 000000000..50587f6bc --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static int16_t cospi_4_64 = 16069; +static int16_t cospi_8_64 = 15137; +static int16_t cospi_12_64 = 13623; +static int16_t cospi_16_64 = 11585; +static int16_t cospi_20_64 = 9102; +static int16_t cospi_24_64 = 6270; +static int16_t cospi_28_64 = 3196; + +static inline void TRANSPOSE8X8( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), + vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), + vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), + vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), + vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +static inline void IDCT8x8_1D( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d16s16 = 
vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = 
vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; +} + +void vp9_idct8x8_64_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = 
vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} + +void vp9_idct8x8_12_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16(cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16(cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, 
+ &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm index ab5bb6920..ab5bb6920 100644 --- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index bc6a17cd1..97fe02805 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -18,8 +18,8 @@ void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1); + vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, 
limit1, thresh1, 1); } void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, @@ -44,6 +44,7 @@ void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); } +#if HAVE_NEON_ASM void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, @@ -51,3 +52,4 @@ void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); } +#endif // HAVE_NEON_ASM diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm index 5b8ec2028..5b8ec2028 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c new file mode 100644 index 000000000..f54d7a94b --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c @@ -0,0 +1,712 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static inline void vp9_loop_filter_neon( + uint8x8_t dblimit, // flimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d3u8 = vabd_u8(d17u8, d16u8); + d4u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + d22u8 = vcgt_u8(d22u8, dthresh); + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); + d17u8 = vqadd_u8(d17u8, d17u8); + + d23u8 = vcge_u8(dlimit, d23u8); + + d18u8 = vdup_n_u8(0x80); + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), + vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), + vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = 
vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; +} + +void vp9_lpf_horizontal_4_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + if (count == 0) // end_vp9_lf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < count; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + vp9_loop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; +} + +void vp9_lpf_vertical_4_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + if (count == 0) // end_vp9_lf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < count; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), + vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), + vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), + vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), + vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = 
vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
+
+    d4Result.val[0] = d4u8;
+    d4Result.val[1] = d5u8;
+    d4Result.val[2] = d6u8;
+    d4Result.val[3] = d7u8;
+
+    src -= 2;
+    vst4_lane_u8(src, d4Result, 0);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 1);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 2);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 3);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 4);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 5);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 6);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 7);
+  }
+  return;
+}
+
+static inline void vp9_mbloop_filter_neon(
+        uint8x8_t dblimit,   // mblimit
+        uint8x8_t dlimit,    // limit
+        uint8x8_t dthresh,   // thresh
+        uint8x8_t d3u8,      // p3
+        uint8x8_t d4u8,      // p2
+        uint8x8_t d5u8,      // p1
+        uint8x8_t d6u8,      // p0
+        uint8x8_t d7u8,      // q0
+        uint8x8_t d16u8,     // q1
+        uint8x8_t d17u8,     // q2
+        uint8x8_t d18u8,     // q3
+        uint8x8_t *d0ru8,    // p2
+        uint8x8_t *d1ru8,    // p1
+        uint8x8_t *d2ru8,    // p0
+        uint8x8_t *d3ru8,    // q0
+        uint8x8_t *d4ru8,    // q1
+        uint8x8_t *d5ru8) {  // q2
+  uint32_t flat;
+  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+  int16x8_t q15s16;
+  uint16x8_t q10u16, q14u16;
+  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+  d19u8 = vabd_u8(d3u8, d4u8);
+  d20u8 = vabd_u8(d4u8, d5u8);
+  d21u8 = vabd_u8(d5u8, d6u8);
+  d22u8 = vabd_u8(d16u8, d7u8);
+  d23u8 = vabd_u8(d17u8, d16u8);
+  d24u8 = vabd_u8(d18u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);
+
+  d25u8 = vabd_u8(d6u8, d4u8);
+
+  d23u8 = vmax_u8(d23u8, d24u8);
+
+  d26u8 = vabd_u8(d7u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+
+  d24u8 = vabd_u8(d6u8, d7u8);
+  d27u8 = vabd_u8(d3u8, d6u8);
+  d28u8 = vabd_u8(d18u8, d7u8);
+
+  d19u8 = vmax_u8(d19u8, d23u8);
+
+  d23u8 = vabd_u8(d5u8, d16u8);
+  d24u8 = vqadd_u8(d24u8, d24u8);
+
+  d19u8 = vcge_u8(dlimit, d19u8);
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+  d26u8 = vmax_u8(d27u8, d28u8);
+
+  d23u8 = vshr_n_u8(d23u8, 1);
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+
+  d24u8 = vqadd_u8(d24u8, d23u8);
+
+  d20u8 = vmax_u8(d20u8, d25u8);
+
+  d23u8 = vdup_n_u8(1);
+  d24u8 = vcge_u8(dblimit, d24u8);
+
+  d21u8 = vcgt_u8(d21u8, dthresh);
+
+  d20u8 = vcge_u8(d23u8, d20u8);
+
+  d19u8 = vand_u8(d19u8, d24u8);
+
+  d23u8 = vcgt_u8(d22u8, dthresh);
+
+  d20u8 = vand_u8(d20u8, d19u8);
+
+  d22u8 = vdup_n_u8(0x80);
+
+  d23u8 = vorr_u8(d21u8, d23u8);
+
+  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+                        vreinterpret_u16_u8(d21u8));
+
+  d30u8 = vshrn_n_u16(q10u16, 4);
+  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+    d27u8 = vdup_n_u8(3);
+    d21u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16,
d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); + + d27u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + q15s16 = vaddw_s8(q15s16, d29s8); + + d29u8 = vdup_n_u8(4); + + d28s8 = vqmovn_s16(q15s16); + + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); + + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); + + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } + + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + + q14u16 = vaddw_u8(q14u16, d5u8); + + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + + d30u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + + d31u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + + d23u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + + d22u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + 
d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + + d6u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + + d7u8 = vqrshrn_n_u16(q14u16, 3); + + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; +} + +void vp9_lpf_horizontal_8_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + if (count == 0) // end_vp9_mblf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < count; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; +} + +void vp9_lpf_vertical_8_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + if (count == 0) + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < count; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), + vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), + vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), + vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), + vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + 
vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); + + d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; +} diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm index 443032217..443032217 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm +++ b/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 284d3a2b5..cb299f9f7 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -45,6 +45,7 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) { } vp9_free_frame_buffer(&cm->post_proc_buffer); + vp9_free_frame_buffer(&cm->post_proc_buffer_int); } void vp9_free_context_buffers(VP9_COMMON *cm) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 893a2bb63..7d7209c56 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -192,6 +192,10 @@ typedef struct macroblockd { int mi_stride; MODE_INFO *mi; + MODE_INFO *left_mi; + MODE_INFO *above_mi; + MB_MODE_INFO *left_mbmi; + MB_MODE_INFO *above_mbmi; int up_available; int left_available; diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c new file mode 100644 index 000000000..f1bdc1b06 --- /dev/null +++ b/vp9/common/vp9_mfqe.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "./vp9_rtcd.h" + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_postproc.h" + +// TODO(jackychen): Replace this function with SSE2 code. There is +// one SSE2 implementation in vp8, so will consider how to share it +// between vp8 and vp9. 
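+//
+// For reference, with MFQE_PRECISION == 4 (defined in vp9_postproc.h) the
+// loop below computes, per pixel,
+//   dst = (src * w + dst * (16 - w) + 8) >> 4,
+// so a src_weight w of 4 gives roughly a 25%/75% src/dst blend, and w == 0
+// leaves dst unchanged; the rounding_bit term rounds to nearest.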
+static void filter_by_weight(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             int block_size, int src_weight) {
+  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+  int r, c;
+
+  for (r = 0; r < block_size; r++) {
+    for (c = 0; c < block_size; c++) {
+      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+               >> MFQE_PRECISION;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
+  filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
+  filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
+                   dst_stride, 16, weight);
+  filter_by_weight(src + src_stride * 16 + 16, src_stride,
+                   dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+  filter_by_weight32x32(src + 32, src_stride, dst + 32,
+                        dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32, src_stride,
+                        dst + dst_stride * 32, dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+                        dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+                          int yd_stride, const uint8_t *u, const uint8_t *v,
+                          int uv_stride, uint8_t *ud, uint8_t *vd,
+                          int uvd_stride, BLOCK_SIZE block_size,
+                          int weight) {
+  if (block_size == BLOCK_16X16) {
+    filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
+    filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
+    filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+  } else if (block_size == BLOCK_32X32) {
+    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
+    filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+  } else if (block_size == BLOCK_64X64) {
+    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+  }
+}
+
+// TODO(jackychen): Determine whether to replace it with assembly code.
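+// The plain copies below serve the no-enhancement path. Their sizes mirror
+// the three block sizes MFQE operates on (BLOCK_16X16/32X32/64X64 luma
+// plus, for the 4:2:0 layout assumed here, half-sized chroma).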
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+                        uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 8; r++) {
+    memcpy(dst, src, 8);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 16; r++) {
+    memcpy(dst, src, 16);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem16x16(src, src_stride, dst, dst_stride);
+  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16, src_stride,
+                dst + dst_stride * 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+                dst + dst_stride * 16 + 16, dst_stride);
+}
+
+static void copy_mem64x64(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem32x32(src, src_stride, dst, dst_stride);
+  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32, src_stride,
+                dst + dst_stride * 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+                dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+                       uint8_t *vd, int yd_stride, int uvd_stride,
+                       BLOCK_SIZE bs) {
+  if (bs == BLOCK_16X16) {
+    copy_mem16x16(y, y_stride, yd, yd_stride);
+    copy_mem8x8(u, uv_stride, ud, uvd_stride);
+    copy_mem8x8(v, uv_stride, vd, uvd_stride);
+  } else if (bs == BLOCK_32X32) {
+    copy_mem32x32(y, y_stride, yd, yd_stride);
+    copy_mem16x16(u, uv_stride, ud, uvd_stride);
+    copy_mem16x16(v, uv_stride, vd, uvd_stride);
+  } else {
+    copy_mem64x64(y, y_stride, yd, yd_stride);
+    copy_mem32x32(u, uv_stride, ud, uvd_stride);
+    copy_mem32x32(v, uv_stride, vd, uvd_stride);
+  }
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+                       const uint8_t *v, int y_stride, int uv_stride,
+                       uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                       int yd_stride, int uvd_stride) {
+  int sad, sad_thr, vdiff;
+  uint32_t sse;
+
+  if (bs == BLOCK_16X16) {
+    vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+  } else if (bs == BLOCK_32X32) {
+    vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    sad = (vp9_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+  } else /* if (bs == BLOCK_64X64) */ {
+    vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+  }
+
+  if (bs == BLOCK_16X16) {
+    sad_thr = 8;
+  } else if (bs == BLOCK_32X32) {
+    sad_thr = 7;
+  } else {  // BLOCK_64X64
+    sad_thr = 6;
+  }
+
+  // TODO(jackychen): More experiments and remove magic numbers.
+  // vdiff > sad * 3 means vdiff should not be too small; otherwise it might
+  // be a lighting change in a smooth area. When there is a lighting change
+  // in a smooth area, it is dangerous to do MFQE.
+  if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
+    // TODO(jackychen): Add weighted average in the calculation.
+    // Currently, the data is copied from the last frame without averaging.
+    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
+                  ud, vd, uvd_stride, bs, 0);
+  } else {
+    // Copy the block from the current frame (i.e., no MFQE is done).
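+    // As a worked example: for a 16x16 block the test above requires an
+    // average per-pixel SAD strictly between 1 and 8 (the +128 >> 8 rounds
+    // and normalizes by the 256 pixels) and a difference variance between
+    // 3x that SAD and 150; anything outside this window falls through to
+    // the plain copy.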
+    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+               yd_stride, uvd_stride, bs);
+  }
+}
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in the current block (for inter frames), or the motion
+  // in the correlated block in the last frame (for keyframes).
+  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+                            mi->mbmi.mv[0].as_mv.row +
+                            mi->mbmi.mv[0].as_mv.col *
+                            mi->mbmi.mv[0].as_mv.col;
+  const int mv_threshold = 100;
+  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
+         cur_bs >= BLOCK_16X16 &&
+         mv_len_square <= mv_threshold;
+}
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+                           const uint8_t *y, const uint8_t *u,
+                           const uint8_t *v, int y_stride, int uv_stride,
+                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                           int yd_stride, int uvd_stride) {
+  int mi_offset, y_offset, uv_offset;
+  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+  // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
+  // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+  const int bsl = b_width_log2_lookup[bs];
+  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+  const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+  if (cur_bs < BLOCK_8X8) {
+    // If there are blocks smaller than 8x8, it must be on the boundary.
+    return;
+  }
+  // No MFQE on blocks smaller than 16x16.
+  if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) {
+    partition = PARTITION_NONE;
+  }
+  switch (partition) {
+    case PARTITION_HORZ:
+    case PARTITION_VERT:
+      // The current block size is not square, so copy the block from the
+      // current frame (i.e., no MFQE is done).
+      // TODO(jackychen): Rectangle blocks should also be taken into account.
+      copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                 yd_stride, uvd_stride, bs);
+      break;
+    case PARTITION_NONE:
+      if (mfqe_decision(mi, cur_bs)) {
+        // Do mfqe on this partition.
+        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride);
+      } else {
+        // Copy the block from the current frame (i.e., no MFQE is done).
+        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                   yd_stride, uvd_stride, bs);
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bs == BLOCK_64X64) {
+        mi_offset = 4;
+        y_offset = 32;
+        uv_offset = 16;
+      } else {
+        mi_offset = 2;
+        y_offset = 16;
+        uv_offset = 8;
+      }
+      // Recurse into the four square partitions, e.g. if bs is 64X64,
+      // look into the four 32X32 blocks inside it.
+      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+                     yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
+                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+                     y + y_offset * y_stride, u + uv_offset * uv_stride,
+                     v + uv_offset * uv_stride, y_stride, uv_stride,
+                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+                     subsize, y + y_offset * y_stride + y_offset,
+                     u + uv_offset * uv_stride + uv_offset,
+                     v + uv_offset * uv_stride + uv_offset, y_stride,
+                     uv_stride, yd + y_offset * yd_stride + y_offset,
+                     ud + uv_offset * uvd_stride + uv_offset,
+                     vd + uv_offset * uvd_stride + uv_offset,
+                     yd_stride, uvd_stride);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+  int mi_row, mi_col;
+  // Current decoded frame.
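+  // (the MFQE source; dest below is the last decoded frame, which receives
+  // the result). The loop walks the frame in 64x64 superblocks; a MODE_INFO
+  // unit covers 8x8 luma pixels, hence the << 3 pixel offsets and, for the
+  // half-resolution 4:2:0 chroma planes, << 2.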
+  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // Last decoded frame, which will store the MFQE result.
+  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+  // Loop through each super block.
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      MODE_INFO *mi;
+      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion info in the last frame.
+      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+                           (mi_row * cm->mi_stride + mi_col);
+      const uint32_t y_stride = show->y_stride;
+      const uint32_t uv_stride = show->uv_stride;
+      const uint32_t yd_stride = dest->y_stride;
+      const uint32_t uvd_stride = dest->uv_stride;
+      const uint32_t row_offset_y = mi_row << 3;
+      const uint32_t row_offset_uv = mi_row << 2;
+      const uint32_t col_offset_y = mi_col << 3;
+      const uint32_t col_offset_uv = mi_col << 2;
+      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+                         col_offset_y;
+      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      if (frame_is_intra_only(cm)) {
+        mi = mi_prev;
+      } else {
+        mi = mi_local;
+      }
+      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+                     vd, yd_stride, uvd_stride);
+    }
+  }
+}
diff --git a/vp9/common/vp9_mfqe.h b/vp9/common/vp9_mfqe.h
new file mode 100644
index 000000000..dfff8c23d
--- /dev/null
+++ b/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// The replacement can only be applied to stationary blocks, as determined
+// by checking the motion of the blocks and other conditions such as the
+// SAD between the current block and the correlated block, the variance
+// of the block difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_MFQE_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index f2c2d255f..55a1f86c7 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -113,6 +113,7 @@ typedef struct VP9Common {
   int new_fb_idx;
 
   YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG post_proc_buffer_int;
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
   FRAME_TYPE frame_type;
@@ -309,6 +310,21 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
   // Are edges available for intra prediction?
xd->up_available = (mi_row != 0); xd->left_available = (mi_col > tile->mi_col_start); + if (xd->up_available) { + xd->above_mi = xd->mi[-xd->mi_stride].src_mi; + xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL; + } else { + xd->above_mi = NULL; + xd->above_mbmi = NULL; + } + + if (xd->left_available) { + xd->left_mi = xd->mi[-1].src_mi; + xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL; + } else { + xd->left_mi = NULL; + xd->left_mbmi = NULL; + } } static INLINE void update_partition_context(MACROBLOCKD *xd, diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 575ffbc30..e1a389132 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -79,6 +79,9 @@ const short vp9_rv[] = { 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; +static const uint8_t q_diff_thresh = 20; +static const uint8_t last_q_thresh = 170; + void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, @@ -616,6 +619,17 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise, } } +static void swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Current mip will be the prev_mip for the next frame. + MODE_INFO *temp = cm->postproc_state.prev_mip; + cm->postproc_state.prev_mip = cm->mip; + cm->mip = temp; + + // Update the upper left visible macroblock ptrs. + cm->mi = cm->mip + cm->mi_stride + 1; + cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1; +} + int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { const int q = MIN(63, cm->lf.filter_level * 10 / 6); @@ -633,6 +647,42 @@ int vp9_post_proc_frame(struct VP9Common *cm, vp9_clear_system_state(); + // Alloc memory for prev_mip in the first frame. + if (cm->current_video_frame == 1) { + cm->postproc_state.last_base_qindex = cm->base_qindex; + cm->postproc_state.last_frame_valid = 1; + ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip)); + if (!ppstate->prev_mip) { + return 1; + } + ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1; + vpx_memset(ppstate->prev_mip, 0, + cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); + } + + // Allocate post_proc_buffer_int if needed. + if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) { + if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { + const int width = ALIGN_POWER_OF_TWO(cm->width, 4); + const int height = ALIGN_POWER_OF_TWO(cm->height, 4); + + if (vp9_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif // CONFIG_VP9_HIGHBITDEPTH + VP9_ENC_BORDER_IN_PIXELS) < 0) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate MFQE framebuffer"); + } + + // Ensure that postproc is set to all 0s so that post proc + // doesn't pull random data in from edge. 
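+      // (The fill value used below is 128, the mid-level for 8-bit samples,
+      // rather than literal zero.)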
+ vpx_memset(cm->post_proc_buffer_int.buffer_alloc, 128, + cm->post_proc_buffer.frame_size); + } + } + #if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -644,7 +694,27 @@ int vp9_post_proc_frame(struct VP9Common *cm, "Failed to allocate post-processing buffer"); #endif - if (flags & VP9D_DEMACROBLOCK) { + if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 && + cm->postproc_state.last_frame_valid && + cm->postproc_state.last_base_qindex <= last_q_thresh && + cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) { + vp9_mfqe(cm); + // TODO(jackychen): Consider whether enable deblocking by default + // if mfqe is enabled. Need to take both the quality and the speed + // into consideration. + if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { + vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + } + if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { + deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, + q + (ppflags->deblocking_level - 5) * 10, + 1, 0); + } else if (flags & VP9D_DEBLOCK) { + vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q); + } else { + vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + } + } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, q + (ppflags->deblocking_level - 5) * 10, 1, 0); } else if (flags & VP9D_DEBLOCK) { @@ -653,6 +723,9 @@ int vp9_post_proc_frame(struct VP9Common *cm, vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); } + cm->postproc_state.last_base_qindex = cm->base_qindex; + cm->postproc_state.last_frame_valid = 1; + if (flags & VP9D_ADDNOISE) { const int noise_level = ppflags->noise_level; if (ppstate->last_q != q || @@ -673,6 +746,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, dest->uv_width = dest->y_width >> cm->subsampling_x; dest->uv_height = dest->y_height >> cm->subsampling_y; + swap_mi_and_prev_mi(cm); return 0; } #endif diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h index ebebc1ae3..035c9cdf8 100644 --- a/vp9/common/vp9_postproc.h +++ b/vp9/common/vp9_postproc.h @@ -14,6 +14,8 @@ #include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_mfqe.h" #include "vp9/common/vp9_ppflags.h" #ifdef __cplusplus @@ -24,6 +26,10 @@ struct postproc_state { int last_q; int last_noise; char noise[3072]; + int last_base_qindex; + int last_frame_valid; + MODE_INFO *prev_mip; + MODE_INFO *prev_mi; DECLARE_ALIGNED(16, char, blackclamp[16]); DECLARE_ALIGNED(16, char, whiteclamp[16]); DECLARE_ALIGNED(16, char, bothclamp[16]); @@ -31,6 +37,8 @@ struct postproc_state { struct VP9Common; +#define MFQE_PRECISION 4 + int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h index 1644a1bbb..12b989f43 100644 --- a/vp9/common/vp9_ppflags.h +++ b/vp9/common/vp9_ppflags.h @@ -26,7 +26,8 @@ enum { VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, VP9D_DEBUG_DRAW_MV = 1 << 7, VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9 + VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, + VP9D_MFQE = 1 << 10 }; typedef struct { diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 901a043f6..fd735f483 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -15,21 +15,17 @@ #include "vp9/common/vp9_pred_common.h" #include 
"vp9/common/vp9_seg_common.h" -static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) { - return (mi != NULL) ? &mi->mbmi : NULL; -} - // Returns a context number for the given MB prediction signal int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ? - left_mbmi->interp_filter : SWITCHABLE_FILTERS; - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ? + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int left_type = xd->left_available && is_inter_block(left_mbmi) ? + left_mbmi->interp_filter : SWITCHABLE_FILTERS; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const int above_type = xd->up_available && is_inter_block(above_mbmi) ? above_mbmi->interp_filter : SWITCHABLE_FILTERS; if (left_type == above_type) @@ -50,10 +46,10 @@ int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // 2 - intra/--, --/intra // 3 - intra/intra int vp9_get_intra_inter_context(const MACROBLOCKD *xd) { - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int has_above = above_mbmi != NULL; - const int has_left = left_mbmi != NULL; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; if (has_above && has_left) { // both edges available const int above_intra = !is_inter_block(above_mbmi); @@ -70,10 +66,10 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd) { int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int ctx; - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int has_above = above_mbmi != NULL; - const int has_left = left_mbmi != NULL; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. 
@@ -113,10 +109,10 @@ int vp9_get_reference_mode_context(const VP9_COMMON *cm, int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int pred_context; - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int above_in_image = above_mbmi != NULL; - const int left_in_image = left_mbmi != NULL; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int above_in_image = xd->up_available; + const int left_in_image = xd->left_available; // Note: // The mode info data structure has a one element border above and to the @@ -194,10 +190,10 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { int pred_context; - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int has_above = above_mbmi != NULL; - const int has_left = left_mbmi != NULL; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. @@ -260,10 +256,10 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { int pred_context; - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int has_above = above_mbmi != NULL; - const int has_left = left_mbmi != NULL; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; // Note: // The mode info data structure has a one element border above and to the @@ -349,10 +345,10 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { // The prediction flags in these dummy entries are initialized to 0. int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const int max_tx_size = max_txsize_lookup[xd->mi[0].src_mi->mbmi.sb_type]; - const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); - const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); - const int has_above = above_mbmi != NULL; - const int has_left = left_mbmi != NULL; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size : max_tx_size; int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index cf13e4a91..bc19d28b9 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -18,20 +18,12 @@ extern "C" { #endif -static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) { - return xd->up_available ? xd->mi[-xd->mi_stride].src_mi : NULL; -} - -static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { - return xd->left_available ? 
xd->mi[-1].src_mi : NULL; -} - int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col); static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; const int above_sip = (above_mi != NULL) ? above_mi->mbmi.seg_id_predicted : 0; const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0; @@ -45,8 +37,8 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, } static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0; return above_skip + left_skip; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1872191ff..575990bb5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -236,36 +236,29 @@ specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/; $vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon; add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/; -$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon; +specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/; add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/; -$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon; +specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/; add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/; -$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon; +specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/; add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/; -$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon; +specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/; add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/; $vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon; add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/; -$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon; +specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/; add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t 
*thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/; -$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon; +specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/; add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/; -$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon; +specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/; @@ -296,36 +289,28 @@ $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; # Sub Pixel Filters # add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc"; -$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon; +specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc"; add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc"; -$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon; +specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc"; add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3"; -$vp9_convolve8_neon_asm=vp9_convolve8_neon; +specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2/, "$avx2_ssse3"; add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3"; -$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon; +specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2/, "$avx2_ssse3"; add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3"; -$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon; +specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2/, "$avx2_ssse3"; add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/; -$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon; +specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/; add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int 
y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/; -$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon; +specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/; add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/; -$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon; +specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/; # # dct @@ -437,48 +422,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_iwht4x4_16_add/; } else { add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; - $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; + specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/; add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; - $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; + specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/; add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; - $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; + specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/; add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; - $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; + specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64"; add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; - $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; + specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64"; add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; - $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; + specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/; add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; - $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; + specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon dspr2/; add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; - $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; + specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon dspr2/; add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; - $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; + specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/; add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; + #is this a typo? 
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; - $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; + specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/; add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 3610c7165..42e0baa05 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -4260,7 +4260,7 @@ void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, // N.B. Only first 4 cols contain non-zero coeffs max_input = _mm_max_epi16(inptr[0], inptr[1]); min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { + for (i = 2; i < 8; i++) { max_input = _mm_max_epi16(max_input, inptr[i]); min_input = _mm_min_epi16(min_input, inptr[i]); } diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index fd781d4bc..4a5bf1b60 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -18,7 +18,7 @@ mov rcx, 0x0400040 movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx + movq xmm5, rcx packsswb xmm4, xmm4 pshuflw xmm0, xmm4, 0b ;k0_k1 pshuflw xmm1, xmm4, 01010101b ;k2_k3 @@ -661,7 +661,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): mov rcx, 0x0400040 movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx + movq xmm5, rcx packsswb xmm4, xmm4 pshuflw xmm0, xmm4, 0b ;k0_k1 pshuflw xmm1, xmm4, 01010101b ;k2_k3 @@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movq xmm0, [rsi - 3] ;load src data movq xmm4, [rsi + 5] - movq xmm7, [rsi + 13] + movq xmm6, [rsi + 13] punpcklqdq xmm0, xmm4 - punpcklqdq xmm4, xmm7 + punpcklqdq xmm4, xmm6 + + movdqa xmm7, xmm0 + punpcklbw xmm7, xmm7 + punpckhbw xmm0, xmm0 movdqa xmm1, xmm0 movdqa xmm2, xmm0 movdqa xmm3, xmm0 - movdqa xmm5, xmm4 - movdqa xmm6, xmm4 - movdqa xmm7, xmm4 - - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pshufb xmm3, [GLOBAL(shuf_t6t7)] - pshufb xmm4, [GLOBAL(shuf_t0t1)] - pshufb xmm5, [GLOBAL(shuf_t2t3)] - pshufb xmm6, [GLOBAL(shuf_t4t5)] - pshufb xmm7, [GLOBAL(shuf_t6t7)] + palignr xmm0, xmm7, 1 + palignr xmm1, xmm7, 5 pmaddubsw xmm0, k0k1 + palignr xmm2, xmm7, 9 pmaddubsw xmm1, k2k3 + palignr xmm3, xmm7, 13 + pmaddubsw xmm2, k4k5 pmaddubsw xmm3, k6k7 - pmaddubsw xmm4, k0k1 - pmaddubsw xmm5, k2k3 - pmaddubsw xmm6, k4k5 - pmaddubsw xmm7, k6k7 - paddsw xmm0, xmm3 + + movdqa xmm3, xmm4 + punpcklbw xmm3, xmm3 + punpckhbw xmm4, xmm4 + + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + + palignr xmm4, xmm3, 1 + palignr xmm5, xmm3, 5 + palignr xmm6, xmm3, 9 + palignr xmm7, xmm3, 13 + movdqa xmm3, xmm1 + pmaddubsw xmm4, k0k1 pmaxsw xmm1, xmm2 + pmaddubsw xmm5, k2k3 pminsw xmm2, xmm3 + pmaddubsw xmm6, k4k5 paddsw xmm0, xmm2 + pmaddubsw xmm7, k6k7 paddsw xmm0, xmm1 paddsw xmm4, xmm7 diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index a088325df..2c5fbacb9 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -747,10 +747,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; width = buf->y_crop_width; height = buf->y_crop_height; - if 
(buf->corrupted) { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Frame reference is corrupt"); - } found = 1; break; } @@ -978,9 +974,12 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, &tile_data->bit_reader, BLOCK_64X64); } pbi->mb.corrupted |= tile_data->xd.corrupted; + if (pbi->mb.corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); } // Loopfilter one row. - if (cm->lf.filter_level && !pbi->mb.corrupted) { + if (cm->lf.filter_level) { const int lf_start = mi_row - MI_BLOCK_SIZE; LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; @@ -1003,7 +1002,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, } // Loopfilter remaining rows in the frame. - if (cm->lf.filter_level && !pbi->mb.corrupted) { + if (cm->lf.filter_level) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; winterface->sync(&pbi->lf_worker); lf_data->start = lf_data->stop; @@ -1564,6 +1563,9 @@ void vp9_decode_frame(VP9Decoder *pbi, xd->corrupted = 0; new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); + if (new_fb->corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data header is corrupted."); // TODO(jzern): remove frame_parallel_decoding_mode restriction for // single-frame tile decoding. @@ -1576,6 +1578,10 @@ void vp9_decode_frame(VP9Decoder *pbi, vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm, pbi->tile_workers, pbi->num_tile_workers, cm->lf.filter_level, 0); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index ecab71a9b..cff94db2d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -440,9 +440,6 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if ((!vp9_is_valid_scale(&ref_buf->sf))) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - if (ref_buf->buf->corrupted) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Block reference is corrupt"); vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf); vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 39f03aac1..2daf86200 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -288,16 +288,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, vp9_clear_system_state(); - // We do not know if the missing frame(s) was supposed to update - // any of the reference buffers, but we act conservative and - // mark only the last buffer as corrupted. - // - // TODO(jkoleszar): Error concealment is undefined and non-normative - // at this point, but if it becomes so, [0] may not always be the correct - // thing to do here. 
- if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL) - cm->frame_refs[0].buf->corrupted = 1; - if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0) cm->frame_bufs[cm->new_fb_idx].ref_count--; diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index 83f4a53d6..22e5217b6 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -16,19 +16,29 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_segmentation.h" -#define AQ_C_SEGMENTS 3 -#define AQ_C_STRENGTHS 3 -static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3}; +#define AQ_C_SEGMENTS 5 +#define DEFAULT_AQ2_SEG 3 // Neutral Q segment +#define AQ_C_STRENGTHS 3 static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = - {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}}; + { {1.75, 1.25, 1.05, 1.00, 0.90}, + {2.00, 1.50, 1.15, 1.00, 0.85}, + {2.50, 1.75, 1.25, 1.00, 0.80} }; static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = - {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}}; -static const double aq_c_var_thresholds[AQ_C_SEGMENTS] = {100.0, 12.0, 10.0}; + { {0.15, 0.30, 0.55, 2.00, 100.0}, + {0.20, 0.40, 0.65, 2.00, 100.0}, + {0.25, 0.50, 0.75, 2.00, 100.0} }; +static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = + { {-4.0, -3.0, -2.0, 100.00, 100.0}, + {-3.5, -2.5, -1.5, 100.00, 100.0}, + {-3.0, -2.0, -1.0, 100.00, 100.0} }; + +#define DEFAULT_COMPLEXITY 64 + static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) { // Approximate base quatizer (truncated to int) const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4; - return (base_quant > 20) + (base_quant > 45); + return (base_quant > 10) + (base_quant > 25); } void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { @@ -43,13 +53,10 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { int segment; const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); - const int active_segments = aq_c_active_segments[aq_strength]; // Clear down the segment map. - vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - - // Clear down the complexity map used for rd. - vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols); + vpx_memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, + cm->mi_rows * cm->mi_cols); vp9_clearall_segfeatures(seg); @@ -65,15 +72,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { // Select delta coding method. seg->abs_delta = SEGMENT_DELTADATA; - // Segment 0 "Q" feature is disabled so it defaults to the baseline Q. - vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); + // Default segment "Q" feature is disabled so it defaults to the baseline Q. + vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); // Use some of the segments for in frame Q adjustment. - for (segment = 1; segment < active_segments; ++segment) { - int qindex_delta = - vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - aq_c_q_adj_factor[aq_strength][segment], - cm->bit_depth); + for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { + int qindex_delta; + + if (segment == DEFAULT_AQ2_SEG) + continue; + + qindex_delta = + vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, + aq_c_q_adj_factor[aq_strength][segment], + cm->bit_depth); + // For AQ complexity mode, we dont allow Q0 in a segment if the base // Q is not 0. 
Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment @@ -90,61 +103,54 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { } } -// Select a segment for the current SB64 block. +#define DEFAULT_LV_THRESH 10.0 +#define MIN_DEFAULT_LV_THRESH 8.0 +#define VAR_STRENGTH_STEP 0.25 +// Select a segment for the current block. // The choice of segment for a block depends on the ratio of the projected -// bits for the block vs a target average. -// An "aq_strength" value determines how many segments are supported, -// the set of transition points to use and the extent of the quantizer -// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()). -void vp9_select_in_frame_q_segment(VP9_COMP *cpi, MACROBLOCK *mb, - BLOCK_SIZE bs, - int mi_row, int mi_col, - int output_enabled, int projected_rate) { +// bits for the block vs a target average and its spatial complexity. +void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate) { VP9_COMMON *const cm = &cpi->common; const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; - const int xmis = MIN(cm->mi_cols - mi_col, bw); - const int ymis = MIN(cm->mi_rows - mi_row, bh); - int complexity_metric = 64; + const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]); + const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]); int x, y; - + int i; unsigned char segment; - if (!output_enabled) { - segment = 0; + if (0) { + segment = DEFAULT_AQ2_SEG; } else { // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). // It is converted to bits * 256 units. const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); - const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); - const int active_segments = aq_c_active_segments[aq_strength]; double logvar; + double low_var_thresh; + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + + vp9_clear_system_state(); + low_var_thresh = (cpi->oxcf.pass == 2) + ? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); logvar = vp9_log_block_var(cpi, mb, bs); - // The number of segments considered and the transition points used to - // select them is determined by the "aq_strength" value. - // Currently this loop only supports segments that reduce Q (i.e. where - // there is undershoot. - // The loop counts down towards segment 0 which is the default segment - // with no Q adjustment. - segment = active_segments - 1; - while (segment > 0) { + segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. + for (i = 0; i < AQ_C_SEGMENTS; ++i) { + // Test rate against a threshold value and variance against a threshold. + // Increasing segment number (higher variance and complexity) = higher Q. 
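The loop that follows is the heart of the new selection: the first segment whose rate and variance tests both pass wins, so blocks that undershoot their rate target and have low spatial complexity land in the low-index segments (largest Q reduction), while everything else falls through to the neutral or high-Q segments. A compact standalone restatement of that rule, using the tables defined earlier in this file (illustrative only, not a drop-in helper; note that with the retuned get_aq_c_strength() a truncated base quantizer of, say, 18 selects strength row 1):

    static int caq_segment_sketch(int projected_rate, int target_rate,
                                  double logvar, double low_var_thresh,
                                  int aq_strength) {
      int i;
      for (i = 0; i < AQ_C_SEGMENTS; ++i) {
        /* Cheap, low-variance blocks match early; the last segment's
         * thresholds of 100.0 make it match essentially always. */
        if (projected_rate < target_rate * aq_c_transitions[aq_strength][i] &&
            logvar < low_var_thresh + aq_c_var_thresholds[aq_strength][i])
          return i;
      }
      return AQ_C_SEGMENTS - 1;  /* mirrors the "just in case" default above */
    }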
if ((projected_rate < - target_rate * aq_c_transitions[aq_strength][segment]) && - (logvar < aq_c_var_thresholds[segment])) { + target_rate * aq_c_transitions[aq_strength][i]) && + (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { + segment = i; break; } - --segment; - } - - if (target_rate > 0) { - complexity_metric = - clamp((int)((projected_rate * 64) / target_rate), 16, 255); } } @@ -152,8 +158,6 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi, MACROBLOCK *mb, for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment; - cpi->complexity_map[mi_offset + y * cm->mi_cols + x] = - (unsigned char)complexity_metric; } } } diff --git a/vp9/encoder/vp9_aq_complexity.h b/vp9/encoder/vp9_aq_complexity.h index 3f885e450..c0dce6c5b 100644 --- a/vp9/encoder/vp9_aq_complexity.h +++ b/vp9/encoder/vp9_aq_complexity.h @@ -19,11 +19,10 @@ extern "C" { struct VP9_COMP; struct macroblock; -// Select a segment for the current SB64. -void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, struct macroblock *x, - BLOCK_SIZE bs, - int mi_row, int mi_col, - int output_enabled, int projected_rate); +// Select a segment for the current Block. +void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *, + BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate); // This function sets up a set of segments with delta Q values around // the baseline frame quantizer. diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 4d88fb5a5..20368f096 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -346,9 +346,8 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, MODE_INFO *mi_8x8, vp9_writer *w) { const struct segmentation *const seg = &cm->seg; const MODE_INFO *const mi = mi_8x8; - const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride].src_mi; - const MODE_INFO *const left_mi = - xd->left_available ? mi_8x8[-1].src_mi : NULL; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; const MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5e6e77dc9..756393f31 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -189,10 +189,10 @@ static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi, // Lighter version of set_offsets that only sets the mode info // pointers. 
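Here, and in the vp9_rdopt.c hunks further down, hand-rolled neighbour lookups are replaced by above_mi/left_mi pointers cached on MACROBLOCKD. The code that fills those pointers is not part of this diff, so the sketch below is an assumption about the write side, inferred from the removed expressions; the type and helper name are placeholders, not libvpx API:

    /* Placeholder type standing in for MODE_INFO; mi is assumed to point into
     * a row-major grid with the given stride. */
    typedef struct { int placeholder; } MI_SKETCH;

    static void cache_neighbour_mis(const MI_SKETCH *mi, int mi_stride,
                                    int up_available, int left_available,
                                    const MI_SKETCH **above_mi,
                                    const MI_SKETCH **left_mi) {
      *above_mi = up_available ? &mi[-mi_stride] : (const MI_SKETCH *)0;
      *left_mi = left_available ? &mi[-1] : (const MI_SKETCH *)0;
    }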
-static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm, - MACROBLOCKD *const xd, - int mi_row, - int mi_col) { +static INLINE void set_mode_info_offsets(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + int mi_row, + int mi_col) { const int idx_str = xd->mi_stride * mi_row + mi_col; xd->mi = cm->mi + idx_str; xd->mi[0].src_mi = &xd->mi[0]; @@ -210,7 +210,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, set_skip_context(xd, mi_row, mi_col); - set_modeinfo_offsets(cm, xd, mi_row, mi_col); + set_mode_info_offsets(cm, xd, mi_row, mi_col); mbmi = &xd->mi[0].src_mi->mbmi; @@ -270,16 +270,15 @@ static void set_block_size(VP9_COMP * const cpi, int mi_row, int mi_col, BLOCK_SIZE bsize) { if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { - set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col); + set_mode_info_offsets(&cpi->common, xd, mi_row, mi_col); xd->mi[0].src_mi->mbmi.sb_type = bsize; - duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize); } } typedef struct { int64_t sum_square_error; int64_t sum_error; - int count; + int log2_count; int variance; } var; @@ -328,7 +327,6 @@ typedef enum { static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { int i; node->part_variances = NULL; - vpx_memset(node->split, 0, sizeof(node->split)); switch (bsize) { case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; @@ -376,18 +374,18 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { static void fill_variance(int64_t s2, int64_t s, int c, var *v) { v->sum_square_error = s2; v->sum_error = s; - v->count = c; - if (c > 0) - v->variance = (int)(256 * - (v->sum_square_error - v->sum_error * v->sum_error / - v->count) / v->count); - else - v->variance = 0; + v->log2_count = c; +} + +static void get_variance(var *v) { + v->variance = (int)(256 * (v->sum_square_error - + ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count); } void sum_2_variances(const var *a, const var *b, var *r) { + assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, - a->sum_error + b->sum_error, a->count + b->count, r); + a->sum_error + b->sum_error, a->log2_count + 1, r); } static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { @@ -434,6 +432,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, // variance is below threshold, otherwise split will be selected. // No check for vert/horiz split as too few samples for variance. if (bsize == bsize_ref) { + get_variance(&vt.part_variances->none); if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && vt.part_variances->none.variance < threshold_bsize_ref) { @@ -442,6 +441,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, } return 0; } else if (bsize > bsize_ref) { + get_variance(&vt.part_variances->none); // For key frame, for bsize above 32X32, or very high variance, take split. if (cm->frame_type == KEY_FRAME && (bsize > BLOCK_32X32 || @@ -455,24 +455,32 @@ static int set_vt_partitioning(VP9_COMP *cpi, set_block_size(cpi, xd, mi_row, mi_col, bsize); return 1; } + // Check vertical split. 
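fill_variance() above no longer computes the variance eagerly: it only records the sums plus the log2 of the sample count (so the leaf calls now pass 0, i.e. log2 of one sample, where they used to pass a count of 1), and get_variance() is invoked only at nodes where a partition decision actually needs the value. Because every count is a power of two -- sum_2_variances() asserts both halves agree and bumps log2_count by one -- the divisions reduce to shifts. A standalone demo of the arithmetic:

    #include <stdio.h>

    int main(void) {
      /* Four samples {1, 3, 5, 7}: SSE = 84, SE = 16, log2_count = 2 (N=4). */
      long long sum_square_error = 84;
      long long sum_error = 16;
      int log2_count = 2;
      /* variance = 256 * (SSE - SE*SE/N) / N, with /N done as >> log2_count:
       * 256 * (84 - 256/4) / 4 = 1280, i.e. the population variance 5
       * scaled by 256 as in get_variance() above. */
      long long variance =
          (256 * (sum_square_error -
                  ((sum_error * sum_error) >> log2_count))) >> log2_count;
      printf("%lld\n", variance);  /* prints 1280 */
      return 0;
    }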
- if (mi_row + block_height / 2 < cm->mi_rows && - vt.part_variances->vert[0].variance < threshold_low && - vt.part_variances->vert[1].variance < threshold_low) { - BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); - set_block_size(cpi, xd, mi_row, mi_col, subsize); - set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize); - return 1; + if (mi_row + block_height / 2 < cm->mi_rows) { + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold_low && + vt.part_variances->vert[1].variance < threshold_low) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); + set_block_size(cpi, xd, mi_row, mi_col, subsize); + set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize); + return 1; + } } // Check horizontal split. - if (mi_col + block_width / 2 < cm->mi_cols && - vt.part_variances->horz[0].variance < threshold_low && - vt.part_variances->horz[1].variance < threshold_low) { - BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); - set_block_size(cpi, xd, mi_row, mi_col, subsize); - set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize); - return 1; + if (mi_col + block_width / 2 < cm->mi_cols) { + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold_low && + vt.part_variances->horz[1].variance < threshold_low) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); + set_block_size(cpi, xd, mi_row, mi_col, subsize); + set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize); + return 1; + } } + return 0; } return 0; @@ -574,7 +582,7 @@ static void choose_partitioning(VP9_COMP *cpi, // If variance is based on 8x8 downsampling, we stop here and have // one sample for 8x8 block (so use 1 for count in fill_variance), // which of course means variance = 0 for 8x8 block. - fill_variance(sse, sum, 1, &vst->split[k].part_variances.none); + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); } else { // For key frame, go down to 4x4. v8x8 *vst2 = &vst->split[k]; @@ -592,7 +600,7 @@ static void choose_partitioning(VP9_COMP *cpi, // If variance is based on 4x4 downsampling, we stop here and have // one sample for 4x4 block (so use 1 for count in fill_variance), // which of course means variance = 0 for 4x4 block. - fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none); + fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none); } } } @@ -700,7 +708,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, mi_addr->src_mi = mi_addr; // If segmentation in use - if (seg->enabled && output_enabled) { + if (seg->enabled) { // For in frame complexity AQ copy the segment id from the segment map. if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { const uint8_t *const map = seg->update_map ? 
cpi->segmentation_map @@ -863,6 +871,18 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, vp9_rd_cost_init(rd_cost); } +static int set_segment_rdmult(VP9_COMP *const cpi, + MACROBLOCK *const x, + int8_t segment_id) { + int segment_qindex; + VP9_COMMON *const cm = &cpi->common; + vp9_init_plane_quantizers(cpi, x); + vp9_clear_system_state(); + segment_qindex = vp9_get_qindex(&cm->seg, segment_id, + cm->base_qindex); + return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); +} + static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, @@ -919,7 +939,6 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, if (aq_mode == VARIANCE_AQ) { const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - int segment_qindex; if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { @@ -929,18 +948,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, : cm->last_frame_seg_map; mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); } - vp9_init_plane_quantizers(cpi, x); - vp9_clear_system_state(); - segment_qindex = vp9_get_qindex(&cm->seg, mbmi->segment_id, - cm->base_qindex); - x->rdmult = vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == COMPLEXITY_AQ) { - const int mi_offset = mi_row * cm->mi_cols + mi_col; - unsigned char complexity = cpi->complexity_map[mi_offset]; - const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) || - (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2)); - if (!is_edge && (complexity > 128)) - x->rdmult += ((x->rdmult * (complexity - 128)) / 256); + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == CYCLIC_REFRESH_AQ) { const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; @@ -967,6 +977,16 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, } } + + // Examine the resulting rate and for AQ mode 2 make a segment choice. 
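With set_segment_rdmult(), VARIANCE_AQ and COMPLEXITY_AQ now derive the rd multiplier the same way: re-initialise the plane quantizers, look up the segment's qindex, and recompute rdmult from it, replacing the old complexity-map nudging of x->rdmult. As a rough guide to what vp9_compute_rd_mult() does with that qindex -- simplified, since the real function also applies frame-type and bit-depth scaling, so treat the constants as illustrative:

    /* dc_quant_sketch() stands in for vp9_dc_quant(), which maps a qindex to
     * the DC quantizer step size through a lookup table. */
    static int dc_quant_sketch(int qindex) {
      return 4 + qindex;  /* monotone stand-in, not the real table */
    }

    static int compute_rd_mult_sketch(int qindex) {
      const int q = dc_quant_sketch(qindex);
      /* The Lagrange multiplier grows with the square of the quantizer step,
       * so higher-Q segments trade distortion for rate more aggressively. */
      return 88 * q * q / 24;
    }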
+ if ((rd_cost->rate != INT_MAX) && + (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) && + (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) { + vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + x->rdmult = orig_rdmult; // TODO(jingning) The rate-distortion optimization flow needs to be @@ -1357,11 +1377,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type]; const int x_mis = MIN(bw, cm->mi_cols - mi_col); const int y_mis = MIN(bh, cm->mi_rows - mi_row); - MV_REF *const frame_mvs = - cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; - int w, h; - *(xd->mi[0].src_mi) = ctx->mic; + xd->mi[0] = ctx->mic; xd->mi[0].src_mi = &xd->mi[0]; if (seg->enabled && cpi->oxcf.aq_mode) { @@ -1382,21 +1399,26 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, if (is_inter_block(mbmi)) { vp9_update_mv_count(td); - if (cm->interp_filter == SWITCHABLE) { const int pred_ctx = vp9_get_pred_context_switchable_interp(xd); ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter]; } } - for (h = 0; h < y_mis; ++h) { - MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; - for (w = 0; w < x_mis; ++w) { - MV_REF *const mv = frame_mv + w; - mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0]; - mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1]; - mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int; - mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int; + if (cm->use_prev_frame_mvs) { + MV_REF *const frame_mvs = + cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + + for (h = 0; h < y_mis; ++h) { + MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mv + w; + mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0]; + mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1]; + mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int; + mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int; + } } } @@ -1761,14 +1783,6 @@ static void rd_use_partition(VP9_COMP *cpi, if (do_recon) { int output_enabled = (bsize == BLOCK_64X64); - - // Check the projected output rate for this SB against it's target - // and and if necessary apply a Q delta using segmentation to get - // closer to the target. - if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { - vp9_select_in_frame_q_segment(cpi, x, bsize, mi_row, mi_col, - output_enabled, chosen_rdc.rate); - } encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } @@ -2500,13 +2514,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); - - // Check the projected output rate for this SB against it's target - // and and if necessary apply a Q delta using segmentation to get - // closer to the target. 
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) - vp9_select_in_frame_q_segment(cpi, x, bsize, mi_row, mi_col, - output_enabled, best_rdc.rate); encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } @@ -2719,27 +2726,27 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, switch (partition) { case PARTITION_NONE: - set_modeinfo_offsets(cm, xd, mi_row, mi_col); + set_mode_info_offsets(cm, xd, mi_row, mi_col); *(xd->mi[0].src_mi) = pc_tree->none.mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); break; case PARTITION_VERT: - set_modeinfo_offsets(cm, xd, mi_row, mi_col); + set_mode_info_offsets(cm, xd, mi_row, mi_col); *(xd->mi[0].src_mi) = pc_tree->vertical[0].mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); if (mi_col + hbs < cm->mi_cols) { - set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs); + set_mode_info_offsets(cm, xd, mi_row, mi_col + hbs); *(xd->mi[0].src_mi) = pc_tree->vertical[1].mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize); } break; case PARTITION_HORZ: - set_modeinfo_offsets(cm, xd, mi_row, mi_col); + set_mode_info_offsets(cm, xd, mi_row, mi_col); *(xd->mi[0].src_mi) = pc_tree->horizontal[0].mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); if (mi_row + hbs < cm->mi_rows) { - set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col); + set_mode_info_offsets(cm, xd, mi_row + hbs, mi_col); *(xd->mi[0].src_mi) = pc_tree->horizontal[1].mic; duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize); } @@ -2784,7 +2791,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, int do_recon, int64_t best_rd, PC_TREE *pc_tree) { const SPEED_FEATURES *const sf = &cpi->sf; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; @@ -3016,14 +3022,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) { int output_enabled = (bsize == BLOCK_64X64); - - // Check the projected output rate for this SB against it's target - // and and if necessary apply a Q delta using segmentation to get - // closer to the target. 
- if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { - vp9_select_in_frame_q_segment(cpi, x, bsize, mi_row, mi_col, - output_enabled, best_rdc.rate); - } encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } @@ -3114,7 +3112,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, if (mi_row + hbs < cm->mi_rows) { pc_tree->horizontal[1].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, - &this_rdc, subsize, &pc_tree->horizontal[0]); + &this_rdc, subsize, &pc_tree->horizontal[1]); pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; @@ -3173,7 +3171,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled, - RD_COST *rd_cost, PC_TREE *pc_tree) { + RD_COST *dummy_cost, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; TileInfo *tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; @@ -3182,9 +3180,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, const int mis = cm->mi_stride; PARTITION_TYPE partition; BLOCK_SIZE subsize; - RD_COST this_rdc; - vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3199,7 +3195,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, switch (partition) { case PARTITION_NONE: pc_tree->none.pred_pixel_ready = 1; - nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, subsize, &pc_tree->none); pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; @@ -3209,7 +3205,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, break; case PARTITION_VERT: pc_tree->vertical[0].pred_pixel_ready = 1; - nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; @@ -3219,23 +3215,17 @@ static void nonrd_use_partition(VP9_COMP *cpi, if (mi_col + hbs < cm->mi_cols) { pc_tree->vertical[1].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, - &this_rdc, subsize, &pc_tree->vertical[1]); + dummy_cost, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs, output_enabled, subsize, &pc_tree->vertical[1]); - - if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && - rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { - rd_cost->rate += this_rdc.rate; - rd_cost->dist += this_rdc.dist; - } } break; case PARTITION_HORZ: pc_tree->horizontal[0].pred_pixel_ready = 1; - nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; @@ -3246,48 +3236,34 @@ static void nonrd_use_partition(VP9_COMP *cpi, if (mi_row + hbs < cm->mi_rows) { pc_tree->horizontal[1].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, - &this_rdc, subsize, &pc_tree->horizontal[0]); + dummy_cost, subsize, &pc_tree->horizontal[1]); 
pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col, output_enabled, subsize, &pc_tree->horizontal[1]); - - if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && - rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { - rd_cost->rate += this_rdc.rate; - rd_cost->dist += this_rdc.dist; - } } break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); - nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, - subsize, output_enabled, rd_cost, - pc_tree->split[0]); - nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, - mi_row, mi_col + hbs, subsize, output_enabled, - &this_rdc, pc_tree->split[1]); - if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && - rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { - rd_cost->rate += this_rdc.rate; - rd_cost->dist += this_rdc.dist; - } - nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp, - mi_row + hbs, mi_col, subsize, output_enabled, - &this_rdc, pc_tree->split[2]); - if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && - rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { - rd_cost->rate += this_rdc.rate; - rd_cost->dist += this_rdc.dist; - } - nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, - mi_row + hbs, mi_col + hbs, subsize, output_enabled, - &this_rdc, pc_tree->split[3]); - if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && - rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { - rd_cost->rate += this_rdc.rate; - rd_cost->dist += this_rdc.dist; + if (bsize == BLOCK_8X8) { + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, + subsize, pc_tree->leaf_split[0]); + encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, + output_enabled, subsize, pc_tree->leaf_split[0]); + } else { + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + subsize, output_enabled, dummy_cost, + pc_tree->split[0]); + nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, + mi_row, mi_col + hbs, subsize, output_enabled, + dummy_cost, pc_tree->split[1]); + nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp, + mi_row + hbs, mi_col, subsize, output_enabled, + dummy_cost, pc_tree->split[2]); + nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, + mi_row + hbs, mi_col + hbs, subsize, output_enabled, + dummy_cost, pc_tree->split[3]); } break; default: @@ -3329,6 +3305,9 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, // Set the partition type of the 64X64 block switch (sf->partition_search_type) { case VAR_BASED_PARTITION: + // TODO(jingning) Only key frame coding supports sub8x8 block at this + // point. To be continued to enable sub8x8 block mode decision for + // P frames. 
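Two things are happening in the nonrd partitioning rewrite above. First, the per-partition rate/distortion accumulation is gone: callers now pass a dummy cost that is never summed, which also sits alongside the copy-paste fix where results for the second horizontal block were being written into pc_tree->horizontal[0] instead of horizontal[1]. Second, PARTITION_SPLIT bottoms out at BLOCK_8X8 with a single fused leaf_split node rather than recursing into sub-8x8 blocks. The control flow, reduced to a sketch (types and helpers here are placeholders, not encoder API):

    typedef enum { B_8X8, B_16X16, B_32X32, B_64X64 } bsize_sketch;

    static bsize_sketch half_size(bsize_sketch b) {
      return (bsize_sketch)(b - 1);
    }

    static void use_split_partition_sketch(bsize_sketch bsize) {
      if (bsize == B_8X8) {
        /* pick modes once for the shared leaf_split[0] node, then encode */
      } else {
        int i;
        for (i = 0; i < 4; ++i)  /* four quadrants at the next smaller size */
          use_split_partition_sketch(half_size(bsize));
      }
    }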
choose_partitioning(cpi, tile_info, x, mi_row, mi_col); nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index a03131ca6..aaa6b238d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -213,9 +213,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->coding_context.last_frame_seg_map_copy); cpi->coding_context.last_frame_seg_map_copy = NULL; - vpx_free(cpi->complexity_map); - cpi->complexity_map = NULL; - vpx_free(cpi->nmvcosts[0]); vpx_free(cpi->nmvcosts[1]); cpi->nmvcosts[0] = NULL; @@ -1445,10 +1442,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - // Create a complexity map used for rd adjustment - CHECK_MEM_ERROR(cm, cpi->complexity_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - // Create a map used for cyclic background refresh. CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7342f7496..14f7c7f0c 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -351,8 +351,6 @@ typedef struct VP9_COMP { // segment threashold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; - unsigned char *complexity_map; - CYCLIC_REFRESH *cyclic_refresh; fractional_mv_step_fp *find_fractional_mv_step; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1da5a83bd..b45032456 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -622,7 +622,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (!cm->error_resilient_mode) + if (cm->use_prev_frame_mvs) vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame, candidates, mi_row, mi_col); else diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 37b6718bf..3cc9d9a7b 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -426,8 +426,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { } // Work out a size correction factor. if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) - correction_factor = (100 * cpi->rc.projected_frame_size) / - projected_size_based_on_q; + correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / + projected_size_based_on_q); // More heavily damped adjustment used if we have been oscillating either side // of target. diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index c1bdff77a..600a3eb1a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -982,8 +982,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb, int i, j; const MACROBLOCKD *const xd = &mb->e_mbd; MODE_INFO *const mic = xd->mi[0].src_mi; - const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi; - const MODE_INFO *left_mi = xd->left_available ? 
xd->mi[-1].src_mi : NULL; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; @@ -1058,8 +1058,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, TX_SIZE best_tx = TX_4X4; int i; int *bmode_costs; - const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi; - const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); bmode_costs = cpi->y_mode_costs[A][L]; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 508e1d4f5..f5f05e799 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -122,8 +122,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin %ifidn %1, b_32x32 - pmovmskb r6, m7 - pmovmskb r2, m12 + pmovmskb r6d, m7 + pmovmskb r2d, m12 or r6, r2 jz .skip_iter %endif @@ -308,8 +308,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %ifidn %1, fp_32x32 pcmpgtw m7, m6, m0 pcmpgtw m12, m11, m0 - pmovmskb r6, m7 - pmovmskb r2, m12 + pmovmskb r6d, m7 + pmovmskb r2d, m12 or r6, r2 jz .skip_iter diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm index 1a9e4e8b6..06b8b034a 100644 --- a/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/vp9/encoder/x86/vp9_subpel_variance.asm @@ -101,7 +101,7 @@ SECTION .text pshufd m4, m6, 0x1 movd [r1], m7 ; store sse paddd m6, m4 - movd rax, m6 ; store sum as return value + movd raxd, m6 ; store sum as return value %else ; mmsize == 8 pshufw m4, m6, 0xe pshufw m3, m7, 0xe @@ -113,7 +113,7 @@ SECTION .text movd [r1], m7 ; store sse pshufw m4, m6, 0xe paddd m6, m4 - movd rax, m6 ; store sum as return value + movd raxd, m6 ; store sum as return value %endif RET %endmacro diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 9414120f6..2504f4db9 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -72,6 +72,8 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm @@ -131,28 +133,52 @@ ifeq ($(ARCH_X86_64), yes) VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += 
common/arm/neon/vp9_convolve8_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+# neon with assembly and intrinsics implementations. If both are available
+# prefer assembly.
+ifeq ($(HAVE_NEON_ASM), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+else
+ifeq ($(HAVE_NEON), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
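One more note, on the vp9_rc_update_rate_correction_factors() hunk above: the added cast widens the intermediate product, because 100 * projected_frame_size overflows a 32-bit int once the frame exceeds INT32_MAX / 100, about 21.4 million bits -- rare, but reachable for large high-resolution intra frames. A self-contained illustration with made-up sizes:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int projected_frame_size = 30000000;      /* ~30 Mbit, made-up value */
      int projected_size_based_on_q = 25000000;
      /* 100 * 30000000 = 3e9 does not fit in 32 bits; widening first, as the
       * patch does, keeps the whole intermediate in int64_t. */
      int correction_factor = (int)((100 * (int64_t)projected_frame_size) /
                                    projected_size_based_on_q);
      printf("%d\n", correction_factor);        /* prints 120 */
      return 0;
    }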