diff options
33 files changed, 943 insertions, 125 deletions
diff --git a/examples.mk b/examples.mk index 91bd45aa4..537d8ff7d 100644 --- a/examples.mk +++ b/examples.mk @@ -31,6 +31,7 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ third_party/libyuv/source/scale_common.cc \ third_party/libyuv/source/scale_mips.cc \ third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc \ third_party/libyuv/source/scale_posix.cc \ third_party/libyuv/source/scale_win.cc \ diff --git a/test/active_map_test.cc b/test/active_map_test.cc index a9bb54090..022199519 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -38,7 +38,7 @@ class ActiveMapTest if (video->frame() == 1) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); } else if (video->frame() == 3) { - vpx_active_map_t map = {0}; + vpx_active_map_t map = vpx_active_map_t(); uint8_t active_map[9 * 13] = { 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, @@ -57,7 +57,7 @@ class ActiveMapTest map.active_map = active_map; encoder->Control(VP8E_SET_ACTIVEMAP, &map); } else if (video->frame() == 15) { - vpx_active_map_t map = {0}; + vpx_active_map_t map = vpx_active_map_t(); map.cols = (kWidth + 15) / 16; map.rows = (kHeight + 15) / 16; map.active_map = NULL; diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 8dcf26ca2..a8d8d4250 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -41,7 +41,7 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + ::libvpx_test::Encoder* /*encoder*/) { const vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index ee417ce2e..c38cc2ea5 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -268,11 +268,13 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride, typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param; typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param; -void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, + int /*tx_type*/) { vp9_fdct16x16_c(in, out, stride); } -void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) { +void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, + int /*tx_type*/) { vp9_idct16x16_256_add_c(in, dest, stride); } diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 4f34a44f7..d2d437c66 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -37,7 +37,7 @@ static int round(double x) { const int kNumCoeffs = 1024; const double kPi = 3.141592653589793238462643383279502884; -void reference_32x32_dct_1d(const double in[32], double out[32], int stride) { +void reference_32x32_dct_1d(const double in[32], double out[32]) { const double kInvSqrt2 = 0.707106781186547524400844362104; for (int k = 0; k < 32; k++) { out[k] = 0.0; @@ -55,7 +55,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], double temp_in[32], temp_out[32]; for (int j = 0; j < 32; ++j) temp_in[j] = input[j*32 + i]; - reference_32x32_dct_1d(temp_in, temp_out, 1); + reference_32x32_dct_1d(temp_in, temp_out); for (int j = 0; j < 32; ++j) output[j * 32 + i] = temp_out[j]; } @@ -64,7 +64,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], double temp_in[32], temp_out[32]; for (int j = 0; j < 32; ++j) temp_in[j] = output[j + i*32]; - reference_32x32_dct_1d(temp_in, temp_out, 1); + reference_32x32_dct_1d(temp_in, temp_out); // Scale by some magic number for (int j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j] / 4; diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 11529b349..5a7114022 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -74,7 +74,7 @@ TEST_P(DecodePerfTest, PerfTest) { libvpx_test::WebMVideoSource video(video_name); video.Init(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.threads = threads; libvpx_test::VP9Decoder decoder(cfg, 0); diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc index 99610ebc5..0ef4f7b93 100644 --- a/test/decode_test_driver.cc +++ b/test/decode_test_driver.cc @@ -106,7 +106,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, } void DecoderTest::RunLoop(CompressedVideoSource *video) { - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); RunLoop(video, dec_cfg); } diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index 1f73c7d20..a757b5974 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -125,20 +125,20 @@ class DecoderTest { const vpx_codec_dec_cfg_t &dec_cfg); // Hook to be called before decompressing every frame. - virtual void PreDecodeFrameHook(const CompressedVideoSource& video, - Decoder *decoder) {} + virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/, + Decoder* /*decoder*/) {} // Hook to be called to handle decode result. Return true to continue. virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const CompressedVideoSource& /* video */, + const CompressedVideoSource& /*video*/, Decoder *decoder) { EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); return VPX_CODEC_OK == res_dec; } // Hook to be called on every decompressed frame. - virtual void DecompressedFrameHook(const vpx_image_t& img, - const unsigned int frame_number) {} + virtual void DecompressedFrameHook(const vpx_image_t& /*img*/, + const unsigned int /*frame_number*/) {} // Hook to be called on peek result virtual void HandlePeekResult(Decoder* const decoder, diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 6d4281d67..9702ddf3f 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -133,13 +133,13 @@ static bool compare_img(const vpx_image_t *img1, return match; } -void EncoderTest::MismatchHook(const vpx_image_t *img1, - const vpx_image_t *img2) { +void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/, + const vpx_image_t* /*img2*/) { ASSERT_TRUE(0) << "Encode/Decode mismatch found"; } void EncoderTest::RunLoop(VideoSource *video) { - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); stats_.Reset(); diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 2270ce22f..a77bd6478 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -189,20 +189,21 @@ class EncoderTest { virtual void RunLoop(VideoSource *video); // Hook to be called at the beginning of a pass. - virtual void BeginPassHook(unsigned int pass) {} + virtual void BeginPassHook(unsigned int /*pass*/) {} // Hook to be called at the end of a pass. virtual void EndPassHook() {} // Hook to be called before encoding a frame. - virtual void PreEncodeFrameHook(VideoSource *video) {} - virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {} + virtual void PreEncodeFrameHook(VideoSource* /*video*/) {} + virtual void PreEncodeFrameHook(VideoSource* /*video*/, + Encoder* /*encoder*/) {} // Hook to be called on every compressed data packet. - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {} + virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {} // Hook to be called on every PSNR packet. - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {} + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {} // Hook to determine whether the encode loop should continue. virtual bool Continue() const { @@ -218,19 +219,19 @@ class EncoderTest { const vpx_image_t *img2); // Hook to be called on every decompressed frame. - virtual void DecompressedFrameHook(const vpx_image_t& img, - vpx_codec_pts_t pts) {} + virtual void DecompressedFrameHook(const vpx_image_t& /*img*/, + vpx_codec_pts_t /*pts*/) {} // Hook to be called to handle decode result. Return true to continue. virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const VideoSource& /* video */, + const VideoSource& /*video*/, Decoder *decoder) { EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); return VPX_CODEC_OK == res_dec; } // Hook that can modify the encoder's output data - virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook( + virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( const vpx_codec_cx_pkt_t *pkt) { return pkt; } diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index fb0449deb..44eba3317 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -285,7 +285,7 @@ class ExternalFrameBufferTest : public ::testing::Test { video_->Init(); video_->Begin(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); ASSERT_TRUE(decoder_ != NULL); } diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 7c4826086..08a69abee 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -40,7 +40,7 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride, typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param; typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param; -void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) { vp9_fdct4x4_c(in, out, stride); } @@ -48,7 +48,7 @@ void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { vp9_fht4x4_c(in, out, stride, tx_type); } -void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) { vp9_fwht4x4_c(in, out, stride); } diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 567e5f698..a694f0c45 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -39,7 +39,7 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride, typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param; typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param; -void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) { vp9_fdct8x8_c(in, out, stride); } diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index db27975a8..1c9a52297 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -27,7 +27,7 @@ class VP9FrameSizeTestsLarge } virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource &video, + const libvpx_test::VideoSource& /*video*/, libvpx_test::Decoder *decoder) { EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 0a1c17c99..34d623696 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -73,7 +73,7 @@ class InvalidFileTest void RunTest() { const DecodeParam input = GET_PARAM(1); libvpx_test::CompressedVideoSource *video = NULL; - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.threads = input.threads; const std::string filename = input.filename; @@ -126,9 +126,9 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, class InvalidFileInvalidPeekTest : public InvalidFileTest { protected: InvalidFileInvalidPeekTest() : InvalidFileTest() {} - virtual void HandlePeekResult(libvpx_test::Decoder *const decoder, - libvpx_test::CompressedVideoSource *video, - const vpx_codec_err_t res_peek) {} + virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource* /*video*/, + const vpx_codec_err_t /*res_peek*/) {} }; TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { diff --git a/test/resize_test.cc b/test/resize_test.cc index 8d08f1ee3..9d0c570ae 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -211,8 +211,8 @@ class ResizeInternalTest : public ResizeTest { EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { #if WRITE_COMPRESSED_STREAM + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { ++out_frames_; // Write initial file header if first frame. @@ -222,8 +222,8 @@ class ResizeInternalTest : public ResizeTest { // Write frame header and data. write_ivf_frame_header(pkt, outfile_); (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); -#endif } +#endif double frame0_psnr_; #if WRITE_COMPRESSED_STREAM diff --git a/test/svc_test.cc b/test/svc_test.cc index 1cb01a407..23e727a32 100644 --- a/test/svc_test.cc +++ b/test/svc_test.cc @@ -60,7 +60,7 @@ class SvcTest : public ::testing::Test { codec_enc_.kf_min_dist = 100; codec_enc_.kf_max_dist = 100; - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); VP9CodecFactory codec_factory; decoder_ = codec_factory.CreateDecoder(dec_cfg, 0); } diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index d7144522b..b9f879d44 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -29,7 +29,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, md5_inv_order_(), n_tiles_(GET_PARAM(1)) { init_flags_ = VPX_CODEC_USE_PSNR; - vpx_codec_dec_cfg_t cfg; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.w = 704; cfg.h = 144; cfg.threads = 1; diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc index 22fce857c..8512d88cf 100644 --- a/test/user_priv_test.cc +++ b/test/user_priv_test.cc @@ -47,7 +47,7 @@ string DecodeFile(const string &filename) { libvpx_test::WebMVideoSource video(filename); video.Init(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); libvpx_test::VP9Decoder decoder(cfg, 0); libvpx_test::MD5 md5; diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc index 470fdf10d..972a1d9a3 100644 --- a/test/vp8_decrypt_test.cc +++ b/test/vp8_decrypt_test.cc @@ -47,7 +47,7 @@ TEST(TestDecrypt, DecryptWorksVp8) { libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf"); video.Init(); - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); VP8Decoder decoder(dec_cfg, 0); video.Begin(); diff --git a/test/vp9_decrypt_test.cc b/test/vp9_decrypt_test.cc index 88a3c14f5..d98861207 100644 --- a/test/vp9_decrypt_test.cc +++ b/test/vp9_decrypt_test.cc @@ -47,7 +47,7 @@ TEST(TestDecrypt, DecryptWorksVp9) { libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf"); video.Init(); - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); VP9Decoder decoder(dec_cfg, 0); video.Begin(); diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index d7fc4eedb..cc354765a 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -163,7 +163,7 @@ string DecodeFile(const string& filename, int num_threads) { libvpx_test::WebMVideoSource video(filename); video.Init(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.threads = num_threads; libvpx_test::VP9Decoder decoder(cfg, 0); diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc new file mode 100644 index 000000000..64c7d10db --- /dev/null +++ b/third_party/libyuv/source/scale_neon64.cc @@ -0,0 +1,790 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#ifdef HAS_SCALEROWDOWN2_NEON +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} +#endif //HAS_SCALEROWDOWN2_NEON + +#ifdef HAS_SCALEROWDOWN2_NEON +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif //HAS_SCALEROWDOWN2_NEON + +#ifdef HAS_SCALEROWDOWN4_NEON +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN4_NEON + +#ifdef HAS_SCALEROWDOWN4_NEON +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + MEMACCESS(1) + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN4_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN34_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} +#endif //ScaleRowDown34_0_Box_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN34_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +#endif //HAS_SCALEROWDOWN38_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile ( + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN38_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(4) + "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN38_NEON + +#if 0 +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + MEMACCESS(0) + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} +#endif //0 + +#ifdef HAS_SCALEARGBROWDOWN2_NEON +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif //HAS_SCALEARGBROWDOWN2_NEON + +#ifdef HAS_SCALEARGBROWDOWN2_NEON +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif //HAS_SCALEARGBROWDOWN2_NEON + +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} +#endif //HAS_SCALEARGBROWDOWNEVEN_NEON + +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_SCALEARGBROWDOWNEVEN_NEON +#endif // __aarch64__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index b05ad146c..d48c4fe5e 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -224,7 +224,7 @@ extern "C" int arnr_strength; int arnr_type; - struct vpx_fixed_buf two_pass_stats_in; + vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; vp8e_tuning tuning; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index f12b88613..1654cd821 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -39,40 +39,28 @@ struct vp8_extracfg }; -struct extraconfig_map -{ - int usage; - struct vp8_extracfg cfg; -}; - -static const struct extraconfig_map extracfg_map[] = -{ - { - 0, - { - NULL, +static struct vp8_extracfg default_extracfg = { + NULL, #if !(CONFIG_REALTIME_ONLY) - 0, /* cpu_used */ + 0, /* cpu_used */ #else - 4, /* cpu_used */ + 4, /* cpu_used */ #endif - 0, /* enable_auto_alt_ref */ - 0, /* noise_sensitivity */ - 0, /* Sharpness */ - 0, /* static_thresh */ + 0, /* enable_auto_alt_ref */ + 0, /* noise_sensitivity */ + 0, /* Sharpness */ + 0, /* static_thresh */ #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) - VP8_EIGHT_TOKENPARTITION, + VP8_EIGHT_TOKENPARTITION, #else - VP8_ONE_TOKENPARTITION, /* token_partitions */ + VP8_ONE_TOKENPARTITION, /* token_partitions */ #endif - 0, /* arnr_max_frames */ - 3, /* arnr_strength */ - 3, /* arnr_type*/ - 0, /* tuning*/ - 10, /* cq_level */ - 0, /* rc_max_intra_bitrate_pct */ - } - } + 0, /* arnr_max_frames */ + 3, /* arnr_strength */ + 3, /* arnr_type*/ + 0, /* tuning*/ + 10, /* cq_level */ + 0, /* rc_max_intra_bitrate_pct */ }; struct vpx_codec_alg_priv @@ -632,9 +620,6 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, { vpx_codec_err_t res = VPX_CODEC_OK; struct vpx_codec_alg_priv *priv; - vpx_codec_enc_cfg_t *cfg; - unsigned int i; - struct VP8_COMP *optr; vp8_rtcd(); @@ -662,17 +647,8 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, ctx->config.enc = &ctx->priv->alg_priv->cfg; } - cfg = &ctx->priv->alg_priv->cfg; - - /* Select the extra vp8 configuration table based on the current - * usage value. If the current usage value isn't found, use the - * values for usage case 0. - */ - for (i = 0; - extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage; - i++); - priv->vp8_cfg = extracfg_map[i].cfg; + priv->vp8_cfg = default_extracfg; priv->vp8_cfg.pkt_list = &priv->pkt_list.head; priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2; diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index bd3b0fdc8..46f463a01 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -76,16 +76,12 @@ struct macroblock { int pred_mv_sad[MAX_REF_FRAMES]; int nmvjointcost[MV_JOINTS]; - int nmvcosts[2][MV_VALS]; int *nmvcost[2]; - int nmvcosts_hp[2][MV_VALS]; int *nmvcost_hp[2]; int **mvcost; int nmvjointsadcost[MV_JOINTS]; - int nmvsadcosts[2][MV_VALS]; int *nmvsadcost[2]; - int nmvsadcosts_hp[2][MV_VALS]; int *nmvsadcost_hp[2]; int **mvsadcost; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index fe39e103f..0626f44a4 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -167,6 +167,26 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->complexity_map); cpi->complexity_map = NULL; + vpx_free(cpi->nmvcosts[0]); + vpx_free(cpi->nmvcosts[1]); + cpi->nmvcosts[0] = NULL; + cpi->nmvcosts[1] = NULL; + + vpx_free(cpi->nmvcosts_hp[0]); + vpx_free(cpi->nmvcosts_hp[1]); + cpi->nmvcosts_hp[0] = NULL; + cpi->nmvcosts_hp[1] = NULL; + + vpx_free(cpi->nmvsadcosts[0]); + vpx_free(cpi->nmvsadcosts[1]); + cpi->nmvsadcosts[0] = NULL; + cpi->nmvsadcosts[1] = NULL; + + vpx_free(cpi->nmvsadcosts_hp[0]); + vpx_free(cpi->nmvsadcosts_hp[1]); + cpi->nmvsadcosts_hp[0] = NULL; + cpi->nmvsadcosts_hp[1] = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -212,8 +232,15 @@ static void save_coding_context(VP9_COMP *cpi) { // intended for use in a re-code loop in vp9_compress_frame where the // quantizer value is adjusted between loop iterations. vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); - vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts); - vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp); + + vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0], + MV_VALS * sizeof(*cpi->nmvcosts[0])); + vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1], + MV_VALS * sizeof(*cpi->nmvcosts[1])); + vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0], + MV_VALS * sizeof(*cpi->nmvcosts_hp[0])); + vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1], + MV_VALS * sizeof(*cpi->nmvcosts_hp[1])); vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); @@ -233,8 +260,15 @@ static void restore_coding_context(VP9_COMP *cpi) { // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); - vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts); - vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp); + + vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], + MV_VALS * sizeof(*cc->nmvcosts[0])); + vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], + MV_VALS * sizeof(*cc->nmvcosts[1])); + vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0], + MV_VALS * sizeof(*cc->nmvcosts_hp[0])); + vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1], + MV_VALS * sizeof(*cc->nmvcosts_hp[1])); vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); @@ -734,6 +768,23 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); + CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); + CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); + for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats, @@ -814,16 +865,16 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->first_time_stamp_ever = INT64_MAX; cal_nmvjointsadcost(cpi->mb.nmvjointsadcost); - cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX]; - cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX]; - cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX]; - cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX]; + cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX]; + cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX]; + cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX]; + cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX]; cal_nmvsadcosts(cpi->mb.nmvsadcost); - cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX]; - cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX]; - cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX]; - cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX]; + cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX]; + cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX]; + cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX]; + cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX]; cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); #if CONFIG_VP9_TEMPORAL_DENOISING @@ -2461,6 +2512,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, RATE_CONTROL *const rc = &cpi->rc; struct vpx_usec_timer cmptimer; YV12_BUFFER_CONFIG *force_src_buffer = NULL; + struct lookahead_entry *last_source = NULL; MV_REFERENCE_FRAME ref_frame; int arf_src_index; @@ -2474,7 +2526,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vpx_usec_timer_start(&cmptimer); cpi->source = NULL; - cpi->last_source = NULL; vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); @@ -2537,11 +2588,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (cm->current_video_frame > 0) { #if CONFIG_SPATIAL_SVC if (is_spatial_svc(cpi)) - cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); + last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); else #endif - cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1); - if (cpi->last_source == NULL) + last_source = vp9_lookahead_peek(cpi->lookahead, -1); + if (last_source == NULL) return -1; } @@ -2565,8 +2616,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img; - cpi->unscaled_last_source = cpi->last_source != NULL ? - &cpi->last_source->img : NULL; + cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; *time_stamp = cpi->source->ts_start; *time_end = cpi->source->ts_end; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 82be0f4b6..9739244c8 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -208,11 +208,11 @@ typedef struct VP9EncoderConfig { int tile_columns; int tile_rows; - struct vpx_fixed_buf two_pass_stats_in; - struct vpx_codec_pkt_list *output_pkt_list; + vpx_fixed_buf_t two_pass_stats_in; + struct vpx_codec_pkt_list *output_pkt_list; #if CONFIG_FP_MB_STATS - struct vpx_fixed_buf firstpass_mb_stats_in; + vpx_fixed_buf_t firstpass_mb_stats_in; #endif vp8e_tuning tuning; @@ -235,7 +235,6 @@ typedef struct VP9_COMP { struct lookahead_ctx *lookahead; struct lookahead_entry *source; struct lookahead_entry *alt_ref_source; - struct lookahead_entry *last_source; YV12_BUFFER_CONFIG *Source; YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames @@ -275,6 +274,11 @@ typedef struct VP9_COMP { CODING_CONTEXT coding_context; + int *nmvcosts[2]; + int *nmvcosts_hp[2]; + int *nmvsadcosts[2]; + int *nmvsadcosts_hp[2]; + int zbin_mode_boost; int zbin_mode_boost_enabled; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 801449b6f..0857e39cd 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -25,7 +25,7 @@ typedef struct { double framerate; int avg_frame_size; TWO_PASS twopass; - struct vpx_fixed_buf rc_twopass_stats_in; + vpx_fixed_buf_t rc_twopass_stats_in; unsigned int current_video_frame_in_layer; int is_key_frame; vpx_svc_parameters_t svc_params_received; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index efde63f15..83f8abb16 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -323,7 +323,7 @@ static vpx_codec_err_t set_encoder_config( if (oxcf->init_framerate > 180) oxcf->init_framerate = 30; - oxcf->mode = BEST; + oxcf->mode = GOOD; switch (cfg->g_pass) { case VPX_RC_ONE_PASS: diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 82bef55c9..d7c580060 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -344,7 +344,7 @@ struct vpx_codec_priv { vpx_codec_priv_cb_pair_t put_slice_cb; } dec; struct { - struct vpx_fixed_buf cx_data_dst_buf; + vpx_fixed_buf_t cx_data_dst_buf; unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 7dbbf2f61..58acf18a8 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -188,14 +188,14 @@ extern "C" { has id 0.*/ } frame; /**< data for compressed frame packet */ - struct vpx_fixed_buf twopass_stats; /**< data for two-pass packet */ - struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ struct vpx_psnr_pkt { unsigned int samples[4]; /**< Number of samples, total/y/u/v */ uint64_t sse[4]; /**< sum squared error, total/y/u/v */ double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ - struct vpx_fixed_buf raw; /**< data for arbitrary packets */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ #if CONFIG_SPATIAL_SVC size_t layer_sizes[VPX_SS_MAX_LAYERS]; #endif @@ -452,14 +452,14 @@ extern "C" { * A buffer containing all of the stats packets produced in the first * pass, concatenated. */ - struct vpx_fixed_buf rc_twopass_stats_in; + vpx_fixed_buf_t rc_twopass_stats_in; /*!\brief first pass mb stats buffer. * * A buffer containing all of the first pass mb stats packets produced * in the first pass, concatenated. */ - struct vpx_fixed_buf rc_firstpass_mb_stats_in; + vpx_fixed_buf_t rc_firstpass_mb_stats_in; /*!\brief Target data rate * @@ -599,7 +599,8 @@ int main_loop(int argc, const char **argv_) { do_scale = 1; else if (arg_match(&arg, &fb_arg, argi)) num_external_frame_buffers = arg_parse_uint(&arg); - + else if (arg_match(&arg, &continuearg, argi)) + keep_going = 1; #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { postproc = 1; @@ -649,11 +650,8 @@ int main_loop(int argc, const char **argv_) { } } else if (arg_match(&arg, &error_concealment, argi)) { ec_enabled = 1; - } else if (arg_match(&arg, &continuearg, argi)) { - keep_going = 1; } - -#endif +#endif // CONFIG_VP8_DECODER else argj++; } |