-rw-r--r--  test/avg_test.cc                           105
-rw-r--r--  test/dct_test.cc                            13
-rw-r--r--  test/svc_test.h                             10
-rw-r--r--  third_party/libyuv/README.libvpx             1
-rw-r--r--  third_party/libyuv/include/libyuv/row.h      4
-rw-r--r--  third_party/libyuv/source/row_gcc.cc       109
-rw-r--r--  vp9/common/ppc/vp9_idct_vsx.c              115
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                  6
-rw-r--r--  vp9/encoder/vp9_aq_cyclicrefresh.c           3
-rw-r--r--  vp9/encoder/vp9_encodeframe.c              131
-rw-r--r--  vp9/encoder/vp9_encoder.c                   20
-rw-r--r--  vp9/encoder/vp9_encoder.h                    2
-rw-r--r--  vp9/encoder/vp9_firstpass.c                200
-rw-r--r--  vp9/encoder/vp9_firstpass.h                 18
-rw-r--r--  vp9/encoder/vp9_pickmode.c                   2
-rw-r--r--  vp9/encoder/vp9_ratectrl.c                  24
-rw-r--r--  vp9/encoder/vp9_speed_features.c             6
-rw-r--r--  vp9/encoder/vp9_speed_features.h             1
-rw-r--r--  vp9/vp9_common.mk                            1
-rw-r--r--  vpx_dsp/ppc/inv_txfm_vsx.c                1148
-rw-r--r--  vpx_dsp/ppc/inv_txfm_vsx.h                  33
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl                 4
-rw-r--r--  vpx_dsp/x86/avg_intrin_sse2.c               50
23 files changed, 1597 insertions(+), 409 deletions(-)
diff --git a/test/avg_test.cc b/test/avg_test.cc index 9c6410b1f..377e54209 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -22,40 +22,41 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { + +template <typename Pixel> class AverageTestBase : public ::testing::Test { public: AverageTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { - source_data_ = reinterpret_cast<uint8_t *>( - vpx_memalign(kDataAlignment, kDataBlockSize)); - } - - static void TearDownTestCase() { + virtual void TearDown() { vpx_free(source_data_); source_data_ = NULL; + libvpx_test::ClearSystemState(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 static const int kDataAlignment = 16; static const int kDataBlockSize = 64 * 128; virtual void SetUp() { + source_data_ = reinterpret_cast<Pixel *>( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; rnd_.Reset(ACMRandom::DeterministicSeed()); } // Sum Pixels - static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 8; ++h) { for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; @@ -63,7 +64,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 32) >> 6); } - static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 4; ++h) { for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; @@ -71,7 +72,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 8) >> 4); } - void FillConstant(uint8_t fill_constant) { + void FillConstant(Pixel fill_constant) { for (int i = 0; i < width_ * height_; ++i) { source_data_[i] = fill_constant; } @@ -79,13 +80,14 @@ class AverageTestBase : public ::testing::Test { void FillRandom() { for (int i = 0; i < width_ * height_; ++i) { - source_data_[i] = rnd_.Rand8(); + source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1); } } int width_, height_; - static uint8_t *source_data_; + Pixel *source_data_; int source_stride_; + int bit_depth_; ACMRandom rnd_; }; @@ -93,7 +95,7 @@ typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); typedef ::testing::tuple<int, int, int, int, AverageFunction> AvgFunc; -class AverageTest : public AverageTestBase, +class AverageTest : public AverageTestBase<uint8_t>, public ::testing::WithParamInterface<AvgFunc> { public: AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -119,12 +121,40 @@ class AverageTest : public AverageTestBase, } }; +#if CONFIG_VP9_HIGHBITDEPTH +class AverageTestHBD : public AverageTestBase<uint16_t>, + public ::testing::WithParamInterface<AvgFunc> { + public: + AverageTestHBD() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + 
ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + ASM_REGISTER_STATE_CHECK(GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_)); + unsigned int actual = GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_); + + EXPECT_EQ(expected, actual); + } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height); typedef ::testing::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam; -class IntProRowTest : public AverageTestBase, +class IntProRowTest : public AverageTestBase<uint8_t>, public ::testing::WithParamInterface<IntProRowParam> { public: IntProRowTest() @@ -135,6 +165,10 @@ class IntProRowTest : public AverageTestBase, protected: virtual void SetUp() { + source_data_ = reinterpret_cast<uint8_t *>( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); + hbuf_asm_ = reinterpret_cast<int16_t *>( vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16)); hbuf_c_ = reinterpret_cast<int16_t *>( @@ -142,6 +176,8 @@ class IntProRowTest : public AverageTestBase, } virtual void TearDown() { + vpx_free(source_data_); + source_data_ = NULL; vpx_free(hbuf_c_); hbuf_c_ = NULL; vpx_free(hbuf_asm_); @@ -166,7 +202,7 @@ typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width); typedef ::testing::tuple<int, IntProColFunc, IntProColFunc> IntProColParam; -class IntProColTest : public AverageTestBase, +class IntProColTest : public AverageTestBase<uint8_t>, public ::testing::WithParamInterface<IntProColParam> { public: IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) { @@ -288,8 +324,6 @@ class BlockErrorTestFP ACMRandom rnd_; }; -uint8_t *AverageTestBase::source_data_ = NULL; - TEST_P(AverageTest, MinValue) { FillConstant(0); CheckAverages(); @@ -308,6 +342,27 @@ TEST_P(AverageTest, Random) { CheckAverages(); } } +#if CONFIG_VP9_HIGHBITDEPTH +TEST_P(AverageTestHBD, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTestHBD, MaxValue) { + FillConstant((1 << VPX_BITS_12) - 1); + CheckAverages(); +} + +TEST_P(AverageTestHBD, Random) { + bit_depth_ = VPX_BITS_12; + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
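Note on the high-bit-depth path exercised above: in a CONFIG_VP9_HIGHBITDEPTH build, 16-bit sample buffers are passed through uint8_t * interfaces, and the test wraps its uint16_t source with CONVERT_TO_BYTEPTR before calling the function under test. The sketch below shows how a vpx_highbd_avg_8x8-style implementation would recover the 16-bit view; the macro expansion is quoted from vpx_dsp/vpx_dsp_common.h as best recalled, so treat it as an assumption rather than a definition.

#include <stdint.h>
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) /* assumed expansion */

static unsigned int highbd_avg_8x8_sketch(const uint8_t *s8, int pitch) {
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8); /* undo CONVERT_TO_BYTEPTR */
  unsigned int sum = 0;
  int h, w;
  for (h = 0; h < 8; ++h)
    for (w = 0; w < 8; ++w) sum += s[h * pitch + w];
  return (sum + 32) >> 6; /* same rounding as ReferenceAverage8x8 above */
}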
+ for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH TEST_P(IntProRowTest, MinValue) { FillConstant(0); @@ -435,6 +490,20 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c), make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); +#endif // HAVE_SSE2 +#endif // CONFIG_VP9_HIGHBITDEPTH + INSTANTIATE_TEST_CASE_P(C, SatdTest, ::testing::Values(make_tuple(16, &vpx_satd_c), make_tuple(64, &vpx_satd_c), diff --git a/test/dct_test.cc b/test/dct_test.cc index e8ad0cd5d..d696d8217 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -683,6 +683,19 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_12))); #endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_vsx_func_info[3] = { + { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_vsx>, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_vsx>, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_vsx>, 16, 1 } +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_vsx_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_VSX #endif // !CONFIG_EMULATE_HARDWARE /* -------------------------------------------------------------------------- */ diff --git a/test/svc_test.h b/test/svc_test.h index a515ab75d..7dfd27c97 100644 --- a/test/svc_test.h +++ b/test/svc_test.h @@ -25,7 +25,15 @@ namespace svc_test { class OnePassCbrSvc : public ::libvpx_test::EncoderTest { public: explicit OnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) - : EncoderTest(codec) {} + : EncoderTest(codec), base_speed_setting_(0), speed_setting_(0), + superframe_count_(0), temporal_layer_id_(0), number_temporal_layers_(0), + number_spatial_layers_(0) { + memset(&svc_params_, 0, sizeof(svc_params_)); + memset(bits_in_buffer_model_, 0, + sizeof(bits_in_buffer_model_[0]) * VPX_MAX_LAYERS); + memset(layer_target_avg_bandwidth_, 0, + sizeof(layer_target_avg_bandwidth_[0]) * VPX_MAX_LAYERS); + } protected: virtual ~OnePassCbrSvc() {} diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index 485f79c0f..5eb9a9b32 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -13,6 +13,7 @@ which down-samples the original input video (f.g. 1280x720) a number of times in order to encode multiple resolution bit streams. 
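As background for the README note above: libvpx uses libyuv's scaler to generate the reduced-resolution inputs for multi-resolution encoding. A minimal sketch of that step, assuming libyuv's I420Scale() from libyuv/scale.h and valid caller-allocated planes:

#include "libyuv/scale.h"

/* Down-sample one I420 frame from 1280x720 to 640x360 (sketch only;
 * strides equal plane widths, error handling omitted). */
static int downscale_720p_to_360p(const uint8_t *src_y, const uint8_t *src_u,
                                  const uint8_t *src_v, uint8_t *dst_y,
                                  uint8_t *dst_u, uint8_t *dst_v) {
  return I420Scale(src_y, 1280, src_u, 640, src_v, 640, 1280, 720,
                   dst_y, 640, dst_u, 320, dst_v, 320, 640, 360,
                   kFilterBox); /* box filter suits down-sampling */
}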
Local Modifications: +Disable some functions (webm:1514) rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \ LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \ all.gyp build_overrides/ chromium/ codereview.settings docs/ \ diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index 013a7e53e..f15fddad8 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -184,10 +184,8 @@ extern "C" { !defined(__i386__) || defined(_MSC_VER) // TODO(fbarchard): fix build error on x86 debug // https://code.google.com/p/libyuv/issues/detail?id=524 -#define HAS_I411TOARGBROW_SSSE3 // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_SSSE3 #endif #endif @@ -212,7 +210,6 @@ extern "C" { #if !(defined(_DEBUG) && defined(__i386__)) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 #endif #define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 @@ -264,7 +261,6 @@ extern "C" { // The following are also available on x64 Visual C. #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) -#define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #endif diff --git a/third_party/libyuv/source/row_gcc.cc b/third_party/libyuv/source/row_gcc.cc index 1ac7ef1aa..a0bc1ac20 100644 --- a/third_party/libyuv/source/row_gcc.cc +++ b/third_party/libyuv/source/row_gcc.cc @@ -1769,77 +1769,6 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, ); } -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422ALPHATOARGBROW_SSSE3 - -#ifdef HAS_I411TOARGBROW_SSSE3 -void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int temp; - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV411_TEMP - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [temp]"=&r"(temp), // %[temp] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS - "xmm0", "xmm1", 
"xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif - void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2229,44 +2158,6 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 -#if defined(HAS_I422ALPHATOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - #if defined(HAS_I422TORGBAROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). diff --git a/vp9/common/ppc/vp9_idct_vsx.c b/vp9/common/ppc/vp9_idct_vsx.c new file mode 100644 index 000000000..1b2a93edb --- /dev/null +++ b/vp9/common/ppc/vp9_idct_vsx.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +#include "vp9/common/vp9_enums.h" + +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + case ADST_DCT: + vpx_idct4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + } + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[8], out[8]; + + // load input data + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + in[2] = load_tran_low(2 * 8 * sizeof(*input), input); + in[3] = load_tran_low(3 * 8 * sizeof(*input), input); + in[4] = load_tran_low(4 * 8 * sizeof(*input), input); + in[5] = load_tran_low(5 * 8 * sizeof(*input), input); + in[6] = load_tran_low(6 * 8 * sizeof(*input), input); + in[7] = load_tran_low(7 * 8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + case ADST_DCT: + vpx_idct8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + } + + vpx_round_store8x8_vsx(in, dest, stride); +} + +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + int16x8_t in0[16], in1[16]; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), in1); + + switch (tx_type) { + case DCT_DCT: + vpx_idct16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + case ADST_DCT: + vpx_idct16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + case DCT_ADST: + vpx_iadst16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + vpx_iadst16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + } + + vpx_round_store16x16_vsx(in0, in1, dest, stride); +} diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 8f5b0bf30..6d7f95260 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -67,9 +67,9 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. - specialize qw/vp9_iht4x4_16_add neon sse2/; - specialize qw/vp9_iht8x8_64_add neon sse2/; - specialize qw/vp9_iht16x16_256_add neon sse2/; + specialize qw/vp9_iht4x4_16_add neon sse2 vsx/; + specialize qw/vp9_iht8x8_64_add neon sse2 vsx/; + specialize qw/vp9_iht16x16_256_add neon sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
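All three vp9_iht*_add_vsx functions above share one pattern: tx_type picks which 1-D kernel runs on the first (row) pass and which on the second (column) pass. Restated as a dispatch table, with identity stubs standing in for the VSX kernels (the stub names are placeholders, not libvpx API):

#include <stdint.h>
#include <string.h>

typedef void (*tx1d_fn)(const int16_t *in, int16_t *out);

/* Stubs standing in for vpx_idct4_vsx / vp9_iadst4_vsx. */
static void idct1d_stub(const int16_t *in, int16_t *out) {
  memcpy(out, in, 16 * sizeof(*in));
}
static void iadst1d_stub(const int16_t *in, int16_t *out) {
  memcpy(out, in, 16 * sizeof(*in));
}

/* Indexed by tx_type in vp9_enums.h order (DCT_DCT, ADST_DCT, DCT_ADST,
 * ADST_ADST); [0] is the first pass, [1] the second, matching the switch
 * statements above. */
static const tx1d_fn iht_dispatch[4][2] = {
  { idct1d_stub, idct1d_stub },   /* DCT_DCT   */
  { idct1d_stub, iadst1d_stub },  /* ADST_DCT  */
  { iadst1d_stub, idct1d_stub },  /* DCT_ADST  */
  { iadst1d_stub, iadst1d_stub }, /* ADST_ADST */
};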
specialize qw/vp9_iht4x4_16_add dspr2 msa/; diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index aadedba39..e11c94932 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -427,8 +427,11 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { double weight_segment_target = 0; double weight_segment = 0; int thresh_low_motion = (cm->width < 720) ? 55 : 20; + int qp_thresh = VPXMIN(20, rc->best_quality << 1); cr->apply_cyclic_refresh = 1; if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + is_lossless_requested(&cpi->oxcf) || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || (cpi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 383f7a8d7..f6e423a7a 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1967,6 +1967,8 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + x->rdmult = orig_rdmult; // TODO(jingning) The rate-distortion optimization flow needs to be @@ -2429,7 +2431,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } x->skip = ctx->skip; - x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0]; + x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, @@ -3317,20 +3319,101 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, } #define FEATURES 4 -static const float partition_breakout_weights_64[FEATURES + 1] = { - -0.016673f, -0.001025f, -0.000032f, 0.000833f, 1.94261885f - 2.1f, +#define Q_CTX 3 +static const float partition_breakout_weights_64[Q_CTX][FEATURES + 1] = { + { + -0.016673f, + -0.001025f, + -0.000032f, + 0.000833f, + 1.94261885f - 2.1f, + }, + { + -0.160867f, + -0.002101f, + 0.000011f, + 0.002448f, + 1.65738142f - 2.5f, + }, + { + -0.628934f, + -0.011459f, + -0.000009f, + 0.013833f, + 1.47982645f - 1.6f, + }, }; -static const float partition_breakout_weights_32[FEATURES + 1] = { - -0.010554f, -0.003081f, -0.000134f, 0.004491f, 1.68445992f - 3.5f, +static const float partition_breakout_weights_32[Q_CTX][FEATURES + 1] = { + { + -0.010554f, + -0.003081f, + -0.000134f, + 0.004491f, + 1.68445992f - 3.5f, + }, + { + -0.051489f, + -0.007609f, + 0.000016f, + 0.009792f, + 1.28089404f - 2.5f, + }, + { + -0.163097f, + -0.013081f, + 0.000022f, + 0.019006f, + 1.36129403f - 3.2f, + }, }; -static const float partition_breakout_weights_16[FEATURES + 1] = { - -0.013154f, -0.002404f, -0.000977f, 0.008450f, 2.57404566f - 5.5f, +static const float partition_breakout_weights_16[Q_CTX][FEATURES + 1] = { + { + -0.013154f, + -0.002404f, + -0.000977f, + 0.008450f, + 2.57404566f - 5.5f, + }, + { + -0.019146f, + -0.004018f, + 0.000064f, + 0.008187f, + 2.15043926f - 2.5f, + }, + { + -0.075755f, + -0.010858f, + 0.000030f, + 0.024505f, + 2.06848121f - 2.5f, + }, }; -static const float partition_breakout_weights_8[FEATURES + 1] = { - -0.011807f, -0.009873f, -0.000931f, 0.034768f, 1.32254851f - 2.0f, +static const float partition_breakout_weights_8[Q_CTX][FEATURES + 1] = { + { + -0.011807f, + -0.009873f, + -0.000931f, + 0.034768f, + 1.32254851f - 2.0f, + }, + { + -0.003861f, + -0.002701f, + 0.000100f, + 
0.013876f, + 1.96755111f - 1.5f, + }, + { + -0.013522f, + -0.008677f, + -0.000562f, + 0.034468f, + 1.53440356f - 1.5f, + }, }; // ML-based partition search breakout. @@ -3338,22 +3421,30 @@ static int ml_predict_breakout(const VP9_COMP *const cpi, BLOCK_SIZE bsize, const MACROBLOCK *const x, const RD_COST *const rd_cost) { DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + const VP9_COMMON *const cm = &cpi->common; float features[FEATURES]; const float *linear_weights = NULL; // Linear model weights. float linear_score = 0.0f; + const int qindex = cm->base_qindex; + const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2); switch (bsize) { - case BLOCK_64X64: linear_weights = partition_breakout_weights_64; break; - case BLOCK_32X32: linear_weights = partition_breakout_weights_32; break; - case BLOCK_16X16: linear_weights = partition_breakout_weights_16; break; - case BLOCK_8X8: linear_weights = partition_breakout_weights_8; break; + case BLOCK_64X64: + linear_weights = partition_breakout_weights_64[q_ctx]; + break; + case BLOCK_32X32: + linear_weights = partition_breakout_weights_32[q_ctx]; + break; + case BLOCK_16X16: + linear_weights = partition_breakout_weights_16[q_ctx]; + break; + case BLOCK_8X8: linear_weights = partition_breakout_weights_8[q_ctx]; break; default: assert(0 && "Unexpected block size."); return 0; } if (!linear_weights) return 0; { // Generate feature values. - const VP9_COMMON *const cm = &cpi->common; - const int ac_q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth); + const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth); const int num_pels_log2 = num_pels_log2_lookup[bsize]; int feature_index = 0; unsigned int var, sse; @@ -3385,9 +3476,10 @@ static int ml_predict_breakout(const VP9_COMP *const cpi, BLOCK_SIZE bsize, linear_score += linear_weights[i] * features[i]; } - return linear_score >= 0; + return linear_score >= cpi->sf.ml_partition_search_breakout_thresh[q_ctx]; } #undef FEATURES +#undef Q_CTX // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization @@ -3559,8 +3651,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, best_rdc.rdcost); if (this_rdc.rate != INT_MAX) { if (bsize >= BLOCK_8X8) { - this_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); this_rdc.rdcost += RDCOST(x->rdmult, x->rddiv, cpi->partition_cost[pl][PARTITION_NONE], 0); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; @@ -3579,7 +3669,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (!x->e_mbd.lossless && ctx->skippable) { int use_ml_based_breakout = cpi->sf.use_ml_partition_search_breakout && - cm->base_qindex >= 200; + cm->base_qindex >= 100; #if CONFIG_VP9_HIGHBITDEPTH if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) use_ml_based_breakout = 0; @@ -3714,7 +3804,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) { - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); sum_rdc.rdcost += RDCOST(x->rdmult, x->rddiv, cpi->partition_cost[pl][PARTITION_SPLIT], 0); sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; @@ -3777,7 +3866,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); sum_rdc.rdcost += RDCOST(x->rdmult, x->rddiv, cpi->partition_cost[pl][PARTITION_HORZ], 0); 
sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; @@ -3827,7 +3915,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); sum_rdc.rdcost += RDCOST(x->rdmult, x->rddiv, cpi->partition_cost[pl][PARTITION_VERT], 0); sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6ec7a5ee8..74e0d85a5 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2952,7 +2952,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, return force_recode; } -void vp9_update_reference_frames(VP9_COMP *cpi) { +void update_ref_frames(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; @@ -3016,6 +3016,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + + update_ref_frames(cpi); + #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->denoiser.denoising_level > kDenLowLow) { @@ -3054,6 +3062,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { denoise_svc_second_layer); } #endif + if (is_one_pass_cbr_svc(cpi)) { // Keep track of frame index for each reference frame. SVC *const svc = &cpi->svc; @@ -5670,6 +5679,15 @@ void setup_tpl_stats(VP9_COMP *cpi) { int tpl_group_frames = 0; int frame_idx; + // TODO(jingning): Make the model support high bit-depth route. +#if CONFIG_VP9_HIGHBITDEPTH + (void)gf_picture; + (void)gf_group; + (void)tpl_group_frames; + (void)frame_idx; + return; +#endif + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); init_tpl_stats(cpi); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index a3d39266f..ec02a78ee 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -758,9 +758,7 @@ typedef struct VP9_COMP { int num_extra_arfs; int arf_pos_in_gf[MAX_EXT_ARFS + 1]; int arf_pos_for_ovrly[MAX_EXT_ARFS + 1]; - int extra_arf_allowed; - int bwd_ref_allowed; vpx_roi_map_t roi; } VP9_COMP; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index c13576343..6717d961d 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2135,7 +2135,7 @@ static void define_gf_multi_arf_structure(VP9_COMP *cpi) { // (3) The bi-predictive group interval is strictly smaller than the // golden group interval. 
const int is_bipred_enabled = - cpi->bwd_ref_allowed && rc->source_alt_ref_pending && + cpi->extra_arf_allowed && rc->source_alt_ref_pending && rc->bipred_group_interval && rc->bipred_group_interval <= (rc->baseline_gf_interval - rc->source_alt_ref_pending); @@ -2439,6 +2439,151 @@ static void define_gf_group_structure(VP9_COMP *cpi) { cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } +static void allocate_gf_multi_arf_bits(VP9_COMP *cpi, int64_t gf_group_bits, + int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + FIRSTPASS_STATS frame_stats; + int i; + int frame_index = 0; + int target_frame_size; + int key_frame; + const int max_bits = frame_max_bits(&cpi->rc, oxcf); + int64_t total_group_bits = gf_group_bits; + int normal_frames; + int normal_frame_bits; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; + + // Define the GF structure and specify + define_gf_multi_arf_structure(cpi); + + //======================================== + + key_frame = cpi->common.frame_type == KEY_FRAME; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + if (!key_frame) { + gf_group->bit_allocation[frame_index] = + rc->source_alt_ref_active ? 0 : gf_arf_bits; + } + + // Deduct the boost bits for arf (or gf if it is not a key frame) + // from the group total. + if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + + ++frame_index; + + // === [frame_index == 1] === + // Store the bits to spend on the ARF if there is one. + if (rc->source_alt_ref_pending) { + gf_group->bit_allocation[frame_index] = gf_arf_bits; + + ++frame_index; + + // Skip all the extra-ARF's right after ARF at the starting segment of + // the current GF group. + if (cpi->num_extra_arfs) { + while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) + ++frame_index; + } + } + + normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); + } + + // Allocate bits to the other frames in the group. + for (i = 0; i < normal_frames; ++i) { + if (EOF == input_stats(twopass, &frame_stats)) break; + + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); + } + + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; + } + + // TODO(zoeliu): Further check whether following is needed for + // hierarchical GF group structure. 
+ if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { + target_frame_size -= (target_frame_size >> 4); + } + + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + if (gf_group->update_type[frame_index] == BRF_UPDATE) { + // Boost up the allocated bits on BWDREF_FRAME + gf_group->bit_allocation[frame_index] = + target_frame_size + (target_frame_size >> 2); + } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) { + // Press down the allocated bits on LAST_BIPRED_UPDATE frames + gf_group->bit_allocation[frame_index] = + target_frame_size - (target_frame_size >> 1); + } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) { + // TODO(zoeliu): Investigate whether the allocated bits on BIPRED_UPDATE + // frames need to be further adjusted. + gf_group->bit_allocation[frame_index] = target_frame_size; + } else { + assert(gf_group->update_type[frame_index] == LF_UPDATE || + gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); + gf_group->bit_allocation[frame_index] = target_frame_size; + } + + ++frame_index; + + // Skip all the extra-ARF's. + if (cpi->num_extra_arfs) { + while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) + ++frame_index; + } + } + + // NOTE: We need to configure the frame at the end of the sequence + 1 that + // will be the start frame for the next group. Otherwise prior to the + // call to av1_rc_get_second_pass_params() the data will be undefined. + if (rc->source_alt_ref_pending) { + if (cpi->num_extra_arfs) { + // NOTE: For bit allocation, move the allocated bits associated with + // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE. + // i > 0 for extra-ARF's and i == 0 for ARF: + // arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE + // arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE + for (i = cpi->num_extra_arfs; i > 0; --i) { + assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] == + INTNL_OVERLAY_UPDATE); + + // Encoder's choice: + // Set show_existing_frame == 1 for all extra-ARF's, and hence + // allocate zero bit for both all internal OVERLAY frames. + gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] = + gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]]; + gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0; + } + } + } +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -2462,17 +2607,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, double this_frame_score = 1.0; // Define the GF structure and specify - cpi->bwd_ref_allowed = 0; - cpi->extra_arf_allowed = 0; - - cpi->num_extra_arfs = 0; - cpi->num_extra_arfs = cpi->extra_arf_allowed ? cpi->num_extra_arfs : 0; - - if (cpi->bwd_ref_allowed) { - define_gf_multi_arf_structure(cpi); - } else { - define_gf_group_structure(cpi); - } + define_gf_group_structure(cpi); key_frame = cpi->common.frame_type == KEY_FRAME; @@ -2620,6 +2755,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int disable_bwd_extarf; + // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. 
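Collecting the per-update-type adjustments from the allocation loop in allocate_gf_multi_arf_bits() above into one sketch (the enum ordering here is hypothetical; only the names and the arithmetic come from the diff):

/* Hypothetical enum order; names follow the diff. */
enum gf_update_type { LF_UPDATE, BRF_UPDATE, LAST_BIPRED_UPDATE,
                      BIPRED_UPDATE, INTNL_OVERLAY_UPDATE };

static int adjust_frame_bits(enum gf_update_type type, int target) {
  switch (type) {
    case BRF_UPDATE:         return target + (target >> 2); /* boost BWDREF */
    case LAST_BIPRED_UPDATE: return target - (target >> 1); /* press down */
    default:                 return target; /* BIPRED / LF / INTNL_OVERLAY */
  }
}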
if (is_key_frame == 0) { @@ -2800,6 +2937,39 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // TODO(zoeliu): Turn on the option to disable extra ALTREFs for still GF + // groups. + // Disable extra altrefs for "still" gf group: + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. +#if 0 + assert(num_mbs > 0); + disable_bwd_extarf = + (zero_motion_accumulator > MIN_ZERO_MOTION && + avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && + avg_raw_err_stdev < MAX_RAW_ERR_VAR); +#else + disable_bwd_extarf = 0; +#endif // 0 + + if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; + + if (!cpi->extra_arf_allowed) { + cpi->num_extra_arfs = 0; + } else { + // Compute how many extra alt_refs we can have + cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval, + rc->source_alt_ref_pending); + } + // Currently at maximum two extra ARFs' are allowed + assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); + + rc->bipred_group_interval = BFG_INTERVAL; + // The minimum bi-predictive frame group interval is 2. + if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0; + // Reset the file position. reset_fpf_position(twopass, start_pos); @@ -2851,7 +3021,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_error_left -= gf_group_err; // Allocate bits to each of the frames in the GF group. - allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); + if (cpi->extra_arf_allowed) { + allocate_gf_multi_arf_bits(cpi, gf_group_bits, gf_arf_bits); + } else { + allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); + } // Reset the file position. reset_fpf_position(twopass, start_pos); diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 958dc128d..404175d92 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ +#include <assert.h> + #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -41,7 +43,12 @@ typedef struct { #define INVALID_ROW -1 +// Length of the bi-predictive frame group (BFG) +// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain +// number of bi-predictive frames. +#define BFG_INTERVAL 2 #define MAX_EXT_ARFS 2 +#define MIN_EXT_ARF_INTERVAL 4 typedef struct { double frame_mb_intra_factor; @@ -210,6 +217,17 @@ void vp9_twopass_postencode_update(struct VP9_COMP *cpi); void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, int *scaled_frame_height); +static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { + assert(MAX_EXT_ARFS > 0); + if (arf_pending) { + if (interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)) + return MAX_EXT_ARFS; + else if (interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS) + return MAX_EXT_ARFS - 1; + } + return 0; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 8b5ad9ac5..eb9abf729 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -2218,7 +2218,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skipping checking: test to see if this block can be reconstructed by // prediction only. 
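A quick sanity check of get_number_of_extra_arfs() above, using the constants just defined (MIN_EXT_ARF_INTERVAL == 4, MAX_EXT_ARFS == 2); this assumes the vp9 encoder headers are on the include path:

#include <assert.h>
#include "vp9/encoder/vp9_firstpass.h"

int main(void) {
  /* With an ARF pending: interval >= 12 -> 2 extra ARFs, >= 8 -> 1. */
  assert(get_number_of_extra_arfs(12, 1) == 2);
  assert(get_number_of_extra_arfs(8, 1) == 1);
  assert(get_number_of_extra_arfs(7, 1) == 0);
  /* No pending ARF -> never any extra ARFs. */
  assert(get_number_of_extra_arfs(100, 0) == 0);
  return 0;
}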
- if (cpi->allow_encode_breakout) { + if (cpi->allow_encode_breakout && !xd->lossless) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, &this_rdc.dist, flag_preduv_computed); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index fdc8e1494..cea13b027 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -414,9 +414,12 @@ static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; - const int drop_mark_layer = - (int)(cpi->svc.framedrop_thresh[i] * lrc->optimal_buffer_level / 100); - if (!(lrc->buffer_level > drop_mark_layer)) return 0; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } } return 1; } @@ -439,12 +442,15 @@ static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; - const int drop_mark_layer = - (int)(cpi->svc.framedrop_thresh[i] * lrc->optimal_buffer_level / 100); - if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { - if (lrc->buffer_level <= drop_mark_layer) return 1; - } else { - if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } } } if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 254c4e2b1..b9f8055bc 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -83,6 +83,9 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->partition_search_breakout_thr.dist = (1 << 21); sf->use_ml_partition_search_breakout = 1; + sf->ml_partition_search_breakout_thresh[0] = 0.0f; + sf->ml_partition_search_breakout_thresh[1] = 0.0f; + sf->ml_partition_search_breakout_thresh[2] = 0.0f; } } @@ -97,6 +100,9 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->partition_search_breakout_thr.dist = (1 << 22); sf->partition_search_breakout_thr.rate = 100; + sf->ml_partition_search_breakout_thresh[0] = 0.0f; + sf->ml_partition_search_breakout_thresh[1] = -1.0f; + sf->ml_partition_search_breakout_thresh[2] = -4.0f; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index eede9cbe2..7a591e491 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -472,6 +472,7 @@ typedef struct SPEED_FEATURES { // Use ML-based partition search early breakout. 
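The new ml_partition_search_breakout_thresh array declared below is consumed by ml_predict_breakout() in vp9_encodeframe.c earlier in this diff: the linear-model score must now clear a per-quantizer-context threshold instead of a fixed zero. Restated standalone (a sketch; the bucketing follows the diff):

static int ml_breakout_sketch(float linear_score, const float thresh[3],
                              int base_qindex) {
  /* Same q_ctx bucketing as ml_predict_breakout(): high q -> bucket 0. */
  const int q_ctx = base_qindex >= 200 ? 0 : (base_qindex >= 150 ? 1 : 2);
  return linear_score >= thresh[q_ctx];
}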
int use_ml_partition_search_breakout; + float ml_partition_search_breakout_thresh[3]; // Machine-learning based partition search early termination int ml_partition_search_early_termination; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index d40d3c445..7ca4004b0 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -68,6 +68,7 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht16x16_add_neon.c diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index 6603b85ac..8d8fb4401 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -14,70 +14,130 @@ #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" #include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, - 16364, 16364, 16364, 16364 }; -static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, - 16305, 16305, 16305, 16305 }; -static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, - 16207, 16207, 16207, 16207 }; -static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, - 16069, 16069, 16069, 16069 }; -static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, - -16069, -16069, -16069, -16069 }; -static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, - 15893, 15893, 15893, 15893 }; -static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, - 15679, 15679, 15679, 15679 }; -static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, - 15426, 15426, 15426, 15426 }; -static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, - 15137, 15137, 15137, 15137 }; -static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, - -15137, -15137, -15137, -15137 }; -static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, - 14811, 14811, 14811, 14811 }; -static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, - 14449, 14449, 14449, 14449 }; -static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, - 14053, 14053, 14053, 14053 }; -static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, - 13623, 13623, 13623, 13623 }; -static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, - 13160, 13160, 13160, 13160 }; -static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, - 12665, 12665, 12665, 12665 }; -static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, - 12140, 12140, 12140, 12140 }; -static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, - 11585, 11585, 11585, 11585 }; -static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, - 11003, 11003, 11003, 11003 }; -static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, - 10394, 10394, 10394, 10394 }; -static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; -static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; -static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, - -9102, -9102, -9102, -9102 }; -static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; -static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; -static int16x8_t cospi23_v = { 7005, 
7005, 7005, 7005, 7005, 7005, 7005, 7005 }; -static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; -static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, - -6270, -6270, -6270, -6270 }; -static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; -static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; -static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; -static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; -static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; -static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; -static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; - -static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364, + -16364, -16364, -16364, -16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305, + -16305, -16305, -16305, -16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893, + -15893, -15893, -15893, -15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811, + -14811, -14811, -14811, -14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449, + -14449, -14449, -14449, -14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623, + -13623, -13623, -13623, -13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160, + -13160, -13160, -13160, -13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585, + -11585, -11585, -11585, -11585 }; +static const int16x8_t 
cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003, + -11003, -11003, -11003, -11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394, + -10394, -10394, -10394, -10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423, + -8423, -8423, -8423, -8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520, + -5520, -5520, -5520, -5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756, + -4756, -4756, -4756, -4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196, + -3196, -3196, -3196, -3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404, + -2404, -2404, -2404, -2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283, + 5283, 5283, 5283, 5283 }; +static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929, + 9929, 9929, 9929, 9929 }; +static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377, + 13377, 13377, 13377, 13377 }; +static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212, + 15212, 15212, 15212, 15212 }; + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; + +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; + #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ const uint32x4_t shift14 = vec_splat_u32(14); @@ -109,26 +169,18 @@ static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); -#define PACK_STORE(v0, v1) \ - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); \ - tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); \ - output_v = vec_packsu(tmp16_0, tmp16_1); \ - \ - vec_vsx_st(output_v, 0, tmp_dest); \ - for (i = 0; i < 4; i++) \ +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \ + tmp16_1 = 
vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; -void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, int stride) { int i, j; - int32x4_t temp1, temp2, temp3, temp4; - int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; - uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - int16x8_t v0 = load_tran_low(0, input); - int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); - int16x8_t t0 = vec_mergeh(v0, v1); - int16x8_t t1 = vec_mergel(v0, v1); uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -138,27 +190,45 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); - + int16x8_t tmp16_0, tmp16_1; uint8x16_t output_v; uint8_t tmp_dest[16]; - ROUND_SHIFT_INIT PIXEL_ADD_INIT; - v0 = vec_mergeh(t0, t1); - v1 = vec_mergel(t0, t1); + PIXEL_ADD4(out[0], in[0]); + PIXEL_ADD4(out[1], in[1]); - IDCT4(v0, v1, t_out0, t_out1); - // transpose - t0 = vec_mergeh(t_out0, t_out1); - t1 = vec_mergel(t_out0, t_out1); - v0 = vec_mergeh(t0, t1); - v1 = vec_mergel(t0, t1); - IDCT4(v0, v1, t_out0, t_out1); + PACK_STORE(out[0], out[1]); +} + +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + int16x8_t t0 = vec_mergeh(in[0], in[1]); + int16x8_t t1 = vec_mergel(in[0], in[1]); + ROUND_SHIFT_INIT - PIXEL_ADD4(v0, t_out0); - PIXEL_ADD4(v1, t_out1); + in[0] = vec_mergeh(t0, t1); + in[1] = vec_mergel(t0, t1); - PACK_STORE(v0, v1); + IDCT4(in[0], in[1], out[0], out[1]); +} + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + // Rows + vpx_idct4_vsx(in, out); + + // Columns + vpx_idct4_vsx(out, in); + + vpx_round_store4x4_vsx(in, out, dest, stride); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -260,28 +330,20 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, #define PIXEL_ADD(in, out, add, shiftx) \ out = vec_add(vec_sra(vec_add(in, add), shiftx), out); -static uint8x16_t tr8_mask0 = { - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 -}; -static uint8x16_t tr8_mask1 = { - 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, - 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F -}; -void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { - int32x4_t temp10, temp11; +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) { int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; - int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, - tmp16_2, tmp16_3; - int16x8_t src0 = load_tran_low(0, input); - int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); - int16x8_t src2 = load_tran_low(16 * sizeof(*input), input); - int16x8_t src3 = load_tran_low(24 * sizeof(*input), 
input); - int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); - int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); - int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); - int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3; + int32x4_t temp10, temp11; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); +} + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) { + uint8x16_t zerov = vec_splat_u8(0); uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -290,7 +352,6 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); - uint8x16_t zerov = vec_splat_u8(0); int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); @@ -302,23 +363,15 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); uint16x8_t shift5 = vec_splat_u16(5); uint8x16_t output0, output1, output2, output3; - ROUND_SHIFT_INIT; - TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, tmp6, tmp7); - - IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, - src3, src4, src5, src6, src7); - IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); - PIXEL_ADD(src0, d_u0, add, shift5); - PIXEL_ADD(src1, d_u1, add, shift5); - PIXEL_ADD(src2, d_u2, add, shift5); - PIXEL_ADD(src3, d_u3, add, shift5); - PIXEL_ADD(src4, d_u4, add, shift5); - PIXEL_ADD(src5, d_u5, add, shift5); - PIXEL_ADD(src6, d_u6, add, shift5); - PIXEL_ADD(src7, d_u7, add, shift5); + PIXEL_ADD(in[0], d_u0, add, shift5); + PIXEL_ADD(in[1], d_u1, add, shift5); + PIXEL_ADD(in[2], d_u2, add, shift5); + PIXEL_ADD(in[3], d_u3, add, shift5); + PIXEL_ADD(in[4], d_u4, add, shift5); + PIXEL_ADD(in[5], d_u5, add, shift5); + PIXEL_ADD(in[6], d_u6, add, shift5); + PIXEL_ADD(in[7], d_u7, add, shift5); output0 = vec_packsu(d_u0, d_u1); output1 = vec_packsu(d_u2, d_u3); output2 = vec_packsu(d_u4, d_u5); @@ -334,24 +387,24 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); } -#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ - in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ - in0 = load(offset, source); \ - in1 = load((step) + (offset), source); \ - in2 = load(2 * (step) + (offset), source); \ - in3 = load(3 * (step) + (offset), source); \ - in4 = load(4 * (step) + (offset), source); \ - in5 = load(5 * (step) + (offset), source); \ - in6 = load(6 * (step) + (offset), source); \ - in7 = load(7 * (step) + (offset), source); \ - in8 = load(8 * (step) + (offset), source); \ - in9 = load(9 * (step) + (offset), source); \ - inA = load(10 * (step) + (offset), source); \ - inB = load(11 * (step) + (offset), source); \ - inC = load(12 * (step) + (offset), source); \ - inD = load(13 * (step) + (offset), source); \ - inE = load(14 * (step) + (offset), source); \ - inF = 
load(15 * (step) + (offset), source); +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src[8], tmp[8]; + + src[0] = load_tran_low(0, input); + src[1] = load_tran_low(8 * sizeof(*input), input); + src[2] = load_tran_low(16 * sizeof(*input), input); + src[3] = load_tran_low(24 * sizeof(*input), input); + src[4] = load_tran_low(32 * sizeof(*input), input); + src[5] = load_tran_low(40 * sizeof(*input), input); + src[6] = load_tran_low(48 * sizeof(*input), input); + src[7] = load_tran_low(56 * sizeof(*input), input); + + vpx_idct8_vsx(src, tmp); + vpx_idct8_vsx(tmp, src); + + vpx_round_store8x8_vsx(src, dest, stride); +} #define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ tmp16_0 = vec_mergeh(inpt0, inpt1); \ @@ -451,9 +504,9 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, tmp16_0 = vec_mergeh(outA, outD); \ tmp16_1 = vec_mergel(outA, outD); \ temp10 = \ - vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v)); \ temp11 = \ - vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v)); \ DCT_CONST_ROUND_SHIFT(temp10); \ DCT_CONST_ROUND_SHIFT(temp11); \ inA = vec_packs(temp10, temp11); \ @@ -525,95 +578,131 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD(in1, d_ul, add, shift6); \ vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); -void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { +static void half_idct16x8_vsx(int16x8_t *src) { + int16x8_t tmp0[8], tmp1[8]; int32x4_t temp10, temp11, temp20, temp21, temp30; - int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, - src11, src12, src13, src14, src15, src16, src17; - int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, - src31, src32, src33, src34, src35, src36, src37; - int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; - int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; - uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, - dest9, destA, destB, destC, destD, destE, destF; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12], + src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13], + src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14], + src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]); +} + +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], 
tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + + IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); +} + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride) { + uint8x16_t destv[16]; int16x8_t d_uh, d_ul; - int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); - uint16x8_t shift6 = vec_splat_u16(6); uint8x16_t zerov = vec_splat_u8(0); + uint16x8_t shift6 = vec_splat_u16(6); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv); + + PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0); + PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride); + PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride); + PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride); + PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride); + PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride); + PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride); + PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride); + + PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride); + PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride); + PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride); + PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride); + PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride); + PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride); + PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride); + PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride); +} +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[16], src1[16]; + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; ROUND_SHIFT_INIT; - // transform rows - // load and transform the upper half of 16x16 matrix - LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, - src11, src02, src12, src03, src13, src04, src14, src05, src15, - src06, src16, src07, src17); - TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, - tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); - TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); - IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, - tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, - src04, src05, src06, src07, src10, src11, src12, src13, src14, src15, - src16, src17); - TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, 
tmp00, - tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); - TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); - - // load and transform the lower half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0); LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), - 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, - src23, src33, src24, src34, src25, src35, src26, src36, src27, - src37); - TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, - tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); - TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); - IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, - tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, - src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, - src36, src37); - TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, - tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); - TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + 8 * sizeof(*input), src1); + + // transform rows + // transform the upper half of 16x16 matrix + half_idct16x8_vsx(src0); + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + + // transform the lower half of 16x16 matrix + half_idct16x8_vsx(src1); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); // transform columns // left half first - IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, - tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, - src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, - src26, src27); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); // right half - IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, - tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, - src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, - src36, src37); + IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); - // load dest - LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, - dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD, - destE, destF); - - PIXEL_ADD_STORE16(src00, src10, dest0, 0); - PIXEL_ADD_STORE16(src01, 
src11, dest1, stride); - PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); - PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); - PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); - PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); - PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); - PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); - - PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); - PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); - PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); - PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); - PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); - PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); - PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); - PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); + vpx_round_store16x16_vsx(src0, src1, dest, stride); } #define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ @@ -1130,3 +1219,610 @@ void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, PACK_STORE(v_a, v_c); } + +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v; + int32x4_t v_v[5], u_v[4]; + int32x4_t zerov = vec_splat_s32(0); + int16x8_t tmp0, tmp1; + int16x8_t zero16v = vec_splat_s16(0); + uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1)); + ROUND_SHIFT_INIT; + + sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v); + sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v); + sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v); + sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v); + sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v), + vec_sub(zero16v, sinpi_3_9_v)); + + tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]); + tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]); + in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1); + in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1); + + v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov); + v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov); + v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov); + v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov); + v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov); + + in[0] = vec_sub(in[0], in[1]); + in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16); + in[0] = vec_add(in[0], in[1]); + in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16); + + u_v[0] = vec_add(v_v[0], v_v[1]); + u_v[1] = vec_sub(v_v[2], v_v[3]); + u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov); + u_v[3] = vec_sub(v_v[1], v_v[3]); + u_v[3] = vec_add(u_v[3], v_v[4]); + + DCT_CONST_ROUND_SHIFT(u_v[0]); + DCT_CONST_ROUND_SHIFT(u_v[1]); + DCT_CONST_ROUND_SHIFT(u_v[2]); + DCT_CONST_ROUND_SHIFT(u_v[3]); + + out[0] = vec_packs(u_v[0], u_v[1]); + out[1] = vec_packs(u_v[2], u_v[3]); +} + +#define MSUM_ROUND_SHIFT(a, b, cospi) \ + b = vec_msums(a, cospi, zerov); \ + DCT_CONST_ROUND_SHIFT(b); + +#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \ + MSUM_ROUND_SHIFT(in0, tmp0, cospi); \ + MSUM_ROUND_SHIFT(in1, tmp1, cospi); \ + out = vec_packs(tmp0, tmp1); + +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[16], tmp1[16]; + + int32x4_t zerov = vec_splat_s32(0); + int16x8_t zero16v = vec_splat_s16(0); + int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v); + int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v); + int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v); + int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v); + int16x8_t cospi_p18_p14_v = 
vec_mergel(cospi18_v, cospi14_v); + int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v); + int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v); + int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v); + int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v); + int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + // stage 1 + // interleave and multiply/add into 32-bit integer + in[0] = vec_mergeh(out[7], out[0]); + in[1] = vec_mergel(out[7], out[0]); + in[2] = vec_mergeh(out[5], out[2]); + in[3] = vec_mergel(out[5], out[2]); + in[4] = vec_mergeh(out[3], out[4]); + in[5] = vec_mergel(out[3], out[4]); + in[6] = vec_mergeh(out[1], out[6]); + in[7] = vec_mergel(out[1], out[6]); + + tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov); + tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov); + tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov); + tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov); + tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov); + tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov); + tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov); + tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov); + tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov); + tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov); + tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov); + tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov); + tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov); + tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov); + tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov); + tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[8]); + tmp0[1] = vec_add(tmp1[1], tmp1[9]); + tmp0[2] = vec_add(tmp1[2], tmp1[10]); + tmp0[3] = vec_add(tmp1[3], tmp1[11]); + tmp0[4] = vec_add(tmp1[4], tmp1[12]); + tmp0[5] = vec_add(tmp1[5], tmp1[13]); + tmp0[6] = vec_add(tmp1[6], tmp1[14]); + tmp0[7] = vec_add(tmp1[7], tmp1[15]); + tmp0[8] = vec_sub(tmp1[0], tmp1[8]); + tmp0[9] = vec_sub(tmp1[1], tmp1[9]); + tmp0[10] = vec_sub(tmp1[2], tmp1[10]); + tmp0[11] = vec_sub(tmp1[3], tmp1[11]); + tmp0[12] = vec_sub(tmp1[4], tmp1[12]); + tmp0[13] = vec_sub(tmp1[5], tmp1[13]); + tmp0[14] = vec_sub(tmp1[6], tmp1[14]); + tmp0[15] = vec_sub(tmp1[7], tmp1[15]); + + // shift and rounding + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + DCT_CONST_ROUND_SHIFT(tmp0[8]); + DCT_CONST_ROUND_SHIFT(tmp0[9]); + DCT_CONST_ROUND_SHIFT(tmp0[10]); + DCT_CONST_ROUND_SHIFT(tmp0[11]); + DCT_CONST_ROUND_SHIFT(tmp0[12]); + DCT_CONST_ROUND_SHIFT(tmp0[13]); + DCT_CONST_ROUND_SHIFT(tmp0[14]); + DCT_CONST_ROUND_SHIFT(tmp0[15]); + + // back to 16-bit + out[0] = vec_packs(tmp0[0], tmp0[1]); + out[1] = vec_packs(tmp0[2], tmp0[3]); + out[2] = vec_packs(tmp0[4], tmp0[5]); + out[3] = vec_packs(tmp0[6], tmp0[7]); + out[4] = vec_packs(tmp0[8], tmp0[9]); + out[5] = vec_packs(tmp0[10], tmp0[11]); + out[6] = vec_packs(tmp0[12], tmp0[13]); + out[7] = vec_packs(tmp0[14], tmp0[15]); + + // stage 2 + in[0] = vec_add(out[0], out[2]); + in[1] = vec_add(out[1], out[3]); + in[2] = vec_sub(out[0], 
out[2]); + in[3] = vec_sub(out[1], out[3]); + in[4] = vec_mergeh(out[4], out[5]); + in[5] = vec_mergel(out[4], out[5]); + in[6] = vec_mergeh(out[6], out[7]); + in[7] = vec_mergel(out[6], out[7]); + + tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov); + tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov); + tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov); + tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov); + tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov); + tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov); + tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov); + tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[4]); + tmp0[1] = vec_add(tmp1[1], tmp1[5]); + tmp0[2] = vec_add(tmp1[2], tmp1[6]); + tmp0[3] = vec_add(tmp1[3], tmp1[7]); + tmp0[4] = vec_sub(tmp1[0], tmp1[4]); + tmp0[5] = vec_sub(tmp1[1], tmp1[5]); + tmp0[6] = vec_sub(tmp1[2], tmp1[6]); + tmp0[7] = vec_sub(tmp1[3], tmp1[7]); + + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + + in[4] = vec_packs(tmp0[0], tmp0[1]); + in[5] = vec_packs(tmp0[2], tmp0[3]); + in[6] = vec_packs(tmp0[4], tmp0[5]); + in[7] = vec_packs(tmp0[6], tmp0[7]); + + // stage 3 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v); + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v); + + out[0] = in[0]; + out[2] = in[6]; + out[4] = in[3]; + out[6] = in[5]; + + out[1] = vec_sub(zero16v, in[4]); + out[3] = vec_sub(zero16v, in[2]); + out[5] = vec_sub(zero16v, in[7]); + out[7] = vec_sub(zero16v, in[1]); +} + +static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[32], tmp1[32]; + int16x8_t tmp16_0[8]; + int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v); + int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v); + int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v); + int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v); + int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v); + int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v); + int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v); + int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v); + int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v); + int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v); + int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v); + int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v); + int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v); + int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v); + int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v); + int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v); + int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v); + int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v); + int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v); + int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v); + int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v); + int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v); + int16x8_t cospi_p08_p24 = 
vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v); + int32x4_t zerov = vec_splat_s32(0); + ROUND_SHIFT_INIT; + + tmp16_0[0] = vec_mergeh(in[15], in[0]); + tmp16_0[1] = vec_mergel(in[15], in[0]); + tmp16_0[2] = vec_mergeh(in[13], in[2]); + tmp16_0[3] = vec_mergel(in[13], in[2]); + tmp16_0[4] = vec_mergeh(in[11], in[4]); + tmp16_0[5] = vec_mergel(in[11], in[4]); + tmp16_0[6] = vec_mergeh(in[9], in[6]); + tmp16_0[7] = vec_mergel(in[9], in[6]); + tmp16_0[8] = vec_mergeh(in[7], in[8]); + tmp16_0[9] = vec_mergel(in[7], in[8]); + tmp16_0[10] = vec_mergeh(in[5], in[10]); + tmp16_0[11] = vec_mergel(in[5], in[10]); + tmp16_0[12] = vec_mergeh(in[3], in[12]); + tmp16_0[13] = vec_mergel(in[3], in[12]); + tmp16_0[14] = vec_mergeh(in[1], in[14]); + tmp16_0[15] = vec_mergel(in[1], in[14]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov); + tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov); + tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov); + tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov); + tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov); + tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov); + tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov); + tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov); + tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov); + tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov); + tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov); + tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov); + tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov); + tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov); + tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov); + tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov); + tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[16]); + tmp1[1] = vec_add(tmp0[1], tmp0[17]); + tmp1[2] = vec_add(tmp0[2], tmp0[18]); + tmp1[3] = vec_add(tmp0[3], tmp0[19]); + tmp1[4] = vec_add(tmp0[4], tmp0[20]); + tmp1[5] = vec_add(tmp0[5], tmp0[21]); + tmp1[6] = vec_add(tmp0[6], tmp0[22]); + tmp1[7] = vec_add(tmp0[7], tmp0[23]); + tmp1[8] = vec_add(tmp0[8], tmp0[24]); + tmp1[9] = vec_add(tmp0[9], tmp0[25]); + tmp1[10] = vec_add(tmp0[10], tmp0[26]); + tmp1[11] = vec_add(tmp0[11], tmp0[27]); + tmp1[12] = vec_add(tmp0[12], tmp0[28]); + tmp1[13] = vec_add(tmp0[13], tmp0[29]); + tmp1[14] = vec_add(tmp0[14], tmp0[30]); + tmp1[15] = vec_add(tmp0[15], tmp0[31]); + tmp1[16] = vec_sub(tmp0[0], tmp0[16]); + tmp1[17] = vec_sub(tmp0[1], tmp0[17]); + tmp1[18] = vec_sub(tmp0[2], tmp0[18]); + tmp1[19] = vec_sub(tmp0[3], tmp0[19]); + tmp1[20] = 
vec_sub(tmp0[4], tmp0[20]); + tmp1[21] = vec_sub(tmp0[5], tmp0[21]); + tmp1[22] = vec_sub(tmp0[6], tmp0[22]); + tmp1[23] = vec_sub(tmp0[7], tmp0[23]); + tmp1[24] = vec_sub(tmp0[8], tmp0[24]); + tmp1[25] = vec_sub(tmp0[9], tmp0[25]); + tmp1[26] = vec_sub(tmp0[10], tmp0[26]); + tmp1[27] = vec_sub(tmp0[11], tmp0[27]); + tmp1[28] = vec_sub(tmp0[12], tmp0[28]); + tmp1[29] = vec_sub(tmp0[13], tmp0[29]); + tmp1[30] = vec_sub(tmp0[14], tmp0[30]); + tmp1[31] = vec_sub(tmp0[15], tmp0[31]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + DCT_CONST_ROUND_SHIFT(tmp1[16]); + DCT_CONST_ROUND_SHIFT(tmp1[17]); + DCT_CONST_ROUND_SHIFT(tmp1[18]); + DCT_CONST_ROUND_SHIFT(tmp1[19]); + DCT_CONST_ROUND_SHIFT(tmp1[20]); + DCT_CONST_ROUND_SHIFT(tmp1[21]); + DCT_CONST_ROUND_SHIFT(tmp1[22]); + DCT_CONST_ROUND_SHIFT(tmp1[23]); + DCT_CONST_ROUND_SHIFT(tmp1[24]); + DCT_CONST_ROUND_SHIFT(tmp1[25]); + DCT_CONST_ROUND_SHIFT(tmp1[26]); + DCT_CONST_ROUND_SHIFT(tmp1[27]); + DCT_CONST_ROUND_SHIFT(tmp1[28]); + DCT_CONST_ROUND_SHIFT(tmp1[29]); + DCT_CONST_ROUND_SHIFT(tmp1[30]); + DCT_CONST_ROUND_SHIFT(tmp1[31]); + + in[0] = vec_packs(tmp1[0], tmp1[1]); + in[1] = vec_packs(tmp1[2], tmp1[3]); + in[2] = vec_packs(tmp1[4], tmp1[5]); + in[3] = vec_packs(tmp1[6], tmp1[7]); + in[4] = vec_packs(tmp1[8], tmp1[9]); + in[5] = vec_packs(tmp1[10], tmp1[11]); + in[6] = vec_packs(tmp1[12], tmp1[13]); + in[7] = vec_packs(tmp1[14], tmp1[15]); + in[8] = vec_packs(tmp1[16], tmp1[17]); + in[9] = vec_packs(tmp1[18], tmp1[19]); + in[10] = vec_packs(tmp1[20], tmp1[21]); + in[11] = vec_packs(tmp1[22], tmp1[23]); + in[12] = vec_packs(tmp1[24], tmp1[25]); + in[13] = vec_packs(tmp1[26], tmp1[27]); + in[14] = vec_packs(tmp1[28], tmp1[29]); + in[15] = vec_packs(tmp1[30], tmp1[31]); + + // stage 2 + tmp16_0[0] = vec_mergeh(in[8], in[9]); + tmp16_0[1] = vec_mergel(in[8], in[9]); + tmp16_0[2] = vec_mergeh(in[10], in[11]); + tmp16_0[3] = vec_mergel(in[10], in[11]); + tmp16_0[4] = vec_mergeh(in[12], in[13]); + tmp16_0[5] = vec_mergel(in[12], in[13]); + tmp16_0[6] = vec_mergeh(in[14], in[15]); + tmp16_0[7] = vec_mergel(in[14], in[15]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[8]); + 
tmp1[1] = vec_add(tmp0[1], tmp0[9]); + tmp1[2] = vec_add(tmp0[2], tmp0[10]); + tmp1[3] = vec_add(tmp0[3], tmp0[11]); + tmp1[4] = vec_add(tmp0[4], tmp0[12]); + tmp1[5] = vec_add(tmp0[5], tmp0[13]); + tmp1[6] = vec_add(tmp0[6], tmp0[14]); + tmp1[7] = vec_add(tmp0[7], tmp0[15]); + tmp1[8] = vec_sub(tmp0[0], tmp0[8]); + tmp1[9] = vec_sub(tmp0[1], tmp0[9]); + tmp1[10] = vec_sub(tmp0[2], tmp0[10]); + tmp1[11] = vec_sub(tmp0[3], tmp0[11]); + tmp1[12] = vec_sub(tmp0[4], tmp0[12]); + tmp1[13] = vec_sub(tmp0[5], tmp0[13]); + tmp1[14] = vec_sub(tmp0[6], tmp0[14]); + tmp1[15] = vec_sub(tmp0[7], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + tmp16_0[0] = vec_add(in[0], in[4]); + tmp16_0[1] = vec_add(in[1], in[5]); + tmp16_0[2] = vec_add(in[2], in[6]); + tmp16_0[3] = vec_add(in[3], in[7]); + tmp16_0[4] = vec_sub(in[0], in[4]); + tmp16_0[5] = vec_sub(in[1], in[5]); + tmp16_0[6] = vec_sub(in[2], in[6]); + tmp16_0[7] = vec_sub(in[3], in[7]); + tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]); + tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]); + tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]); + tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]); + tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]); + tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]); + tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]); + tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 3 + in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]); + in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]); + in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]); + in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]); + in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]); + in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]); + in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]); + in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]); + + tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov); + tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov); + tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov); + tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov); + tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov); + tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov); + tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov); + tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov); + tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov); + tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov); + tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov); + tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov); + tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov); + tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov); + tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov); + tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[4]); + tmp1[1] = vec_add(tmp0[1], tmp0[5]); + tmp1[2] = vec_add(tmp0[2], tmp0[6]); + tmp1[3] = vec_add(tmp0[3], tmp0[7]); + tmp1[4] = vec_sub(tmp0[0], tmp0[4]); + tmp1[5] = vec_sub(tmp0[1], tmp0[5]); + tmp1[6] = vec_sub(tmp0[2], tmp0[6]); + tmp1[7] = vec_sub(tmp0[3], tmp0[7]); + tmp1[8] = vec_add(tmp0[8], tmp0[12]); + tmp1[9] = vec_add(tmp0[9], tmp0[13]); + tmp1[10] = vec_add(tmp0[10], tmp0[14]); + tmp1[11] = vec_add(tmp0[11], tmp0[15]); + tmp1[12] = vec_sub(tmp0[8], 
tmp0[12]); + tmp1[13] = vec_sub(tmp0[9], tmp0[13]); + tmp1[14] = vec_sub(tmp0[10], tmp0[14]); + tmp1[15] = vec_sub(tmp0[11], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + in[0] = vec_add(tmp16_0[0], tmp16_0[2]); + in[1] = vec_add(tmp16_0[1], tmp16_0[3]); + in[2] = vec_sub(tmp16_0[0], tmp16_0[2]); + in[3] = vec_sub(tmp16_0[1], tmp16_0[3]); + in[4] = vec_packs(tmp1[0], tmp1[1]); + in[5] = vec_packs(tmp1[2], tmp1[3]); + in[6] = vec_packs(tmp1[4], tmp1[5]); + in[7] = vec_packs(tmp1[6], tmp1[7]); + in[8] = vec_add(tmp16_0[8], tmp16_0[10]); + in[9] = vec_add(tmp16_0[9], tmp16_0[11]); + in[10] = vec_sub(tmp16_0[8], tmp16_0[10]); + in[11] = vec_sub(tmp16_0[9], tmp16_0[11]); + in[12] = vec_packs(tmp1[8], tmp1[9]); + in[13] = vec_packs(tmp1[10], tmp1[11]); + in[14] = vec_packs(tmp1[12], tmp1[13]); + in[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 4 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + out[4] = vec_mergeh(in[10], in[11]); + out[5] = vec_mergel(in[10], in[11]); + out[6] = vec_mergeh(in[14], in[15]); + out[7] = vec_mergel(in[14], in[15]); +} + +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[16], tmp1[16], tmp2[8]; + int32x4_t tmp3, tmp4; + int16x8_t zero16v = vec_splat_s16(0); + int32x4_t zerov = vec_splat_s32(0); + int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v); + int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12], + tmp0[13], tmp0[14], tmp0[15]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12], + tmp1[13], tmp1[14], tmp1[15]); + + iadst16x8_vsx(tmp0, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16); + + src0[0] = tmp0[0]; + src0[2] = vec_sub(zero16v, tmp0[8]); + src0[4] = tmp0[12]; + src0[6] = vec_sub(zero16v, tmp0[4]); + src1[8] = tmp0[5]; + src1[10] = vec_sub(zero16v, tmp0[13]); + src1[12] = tmp0[9]; + src1[14] = vec_sub(zero16v, tmp0[1]); + + 
iadst16x8_vsx(tmp1, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16); + + src0[1] = tmp1[0]; + src0[3] = vec_sub(zero16v, tmp1[8]); + src0[5] = tmp1[12]; + src0[7] = vec_sub(zero16v, tmp1[4]); + src1[9] = tmp1[5]; + src1[11] = vec_sub(zero16v, tmp1[13]); + src1[13] = tmp1[9]; + src1[15] = vec_sub(zero16v, tmp1[1]); +} diff --git a/vpx_dsp/ppc/inv_txfm_vsx.h b/vpx_dsp/ppc/inv_txfm_vsx.h new file mode 100644 index 000000000..36159850a --- /dev/null +++ b/vpx_dsp/ppc/inv_txfm_vsx.h @@ -0,0 +1,33 @@ +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride); +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out); + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride); +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out); + +#define LOAD_INPUT16(load, source, offset, step, in) \ + in[0] = load(offset, source); \ + in[1] = load((step) + (offset), source); \ + in[2] = load(2 * (step) + (offset), source); \ + in[3] = load(3 * (step) + (offset), source); \ + in[4] = load(4 * (step) + (offset), source); \ + in[5] = load(5 * (step) + (offset), source); \ + in[6] = load(6 * (step) + (offset), source); \ + in[7] = load(7 * (step) + (offset), source); \ + in[8] = load(8 * (step) + (offset), source); \ + in[9] = load(9 * (step) + (offset), source); \ + in[10] = load(10 * (step) + (offset), source); \ + in[11] = load(11 * (step) + (offset), source); \ + in[12] = load(12 * (step) + (offset), source); \ + in[13] = load(13 * (step) + (offset), source); \ + in[14] = load(14 * (step) + (offset), source); \ + in[15] = load(15 * (step) + (offset), source); + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride); +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1); +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9f3e268cc..0f33562cc 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -985,7 +985,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Avg # add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p"; + specialize qw/vpx_highbd_avg_8x8 sse2/; + add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p"; + specialize qw/vpx_highbd_avg_4x4 sse2/; + add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index a235ba41d..0362c63c5 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -138,6 +138,56 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { return (avg + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH 
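+// The high-bitdepth buffer is addressed as uint16_t behind the uint8_t
+// pointer (CONVERT_TO_SHORTPTR). Row sums use saturating 16-bit adds,
+// which cannot overflow for 8 rows of up-to-12-bit samples; the 8x8
+// reduction then widens to 32 bits before rounding, while the 4x4 case
+// stays entirely in 16 bits.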
+unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
+  __m128i s0, s1;
+  unsigned int avg;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  const __m128i zero = _mm_setzero_si128();
+  s0 = _mm_loadu_si128((const __m128i *)(s));
+  s1 = _mm_loadu_si128((const __m128i *)(s + p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpackhi_epi16(s0, zero);
+  s0 = _mm_unpacklo_epi16(s0, zero);
+  s0 = _mm_add_epi32(s0, s1);
+  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
+  avg = _mm_cvtsi128_si32(s0);
+
+  return (avg + 32) >> 6;
+}
+
+unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
+  __m128i s0, s1;
+  unsigned int avg;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  s0 = _mm_loadl_epi64((const __m128i *)(s));
+  s1 = _mm_loadl_epi64((const __m128i *)(s + p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
+  avg = _mm_extract_epi16(s0, 0);
+
+  return (avg + 8) >> 4;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
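The change above consistently factors each VSX inverse transform into reusable 1-D passes plus a shared round-and-store step: vpx_idct8x8_64_add_vsx is now literally two vpx_idct8_vsx calls followed by vpx_round_store8x8_vsx, and the 16x16 path gets the same treatment with vpx_idct16_vsx and vpx_round_store16x16_vsx exported alongside it. Exporting these pieces (and the vp9_-prefixed iadst variants) in inv_txfm_vsx.h is what lets VP9's hybrid transforms mix DCT and ADST passes. Below is a minimal sketch of such a wrapper, assuming load_tran_low() from vpx_dsp/ppc/bitdepth_conversion_vsx.h and VP9's TX_TYPE enum; the vp9_iht8x8_64_add_vsx name and the exact row/column assignment are illustrative, not quoted from this diff:

#include "vp9/common/vp9_enums.h"                 // TX_TYPE values (assumed)
#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"  // load_tran_low (assumed)
#include "vpx_dsp/ppc/inv_txfm_vsx.h"

void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
                           int tx_type) {
  int i;
  int16x8_t in[8], out[8];

  // One vector per 8-coefficient row of the 8x8 block.
  for (i = 0; i < 8; i++) {
    in[i] = load_tran_low(8 * i * sizeof(*input), input);
  }

  // Each 1-D helper transposes internally, so the first call is the row
  // pass and the second the column pass, as in vpx_idct8x8_64_add_vsx.
  switch (tx_type) {
    case DCT_DCT:
      vpx_idct8_vsx(in, out);
      vpx_idct8_vsx(out, in);
      break;
    case ADST_DCT:
      vpx_idct8_vsx(in, out);
      vp9_iadst8_vsx(out, in);
      break;
    case DCT_ADST:
      vp9_iadst8_vsx(in, out);
      vpx_idct8_vsx(out, in);
      break;
    default:  // ADST_ADST
      vp9_iadst8_vsx(in, out);
      vp9_iadst8_vsx(out, in);
      break;
  }

  // Shared final rounding ((x + 16) >> 5), clamp, and add into dest.
  vpx_round_store8x8_vsx(in, dest, stride);
}

The same composition applies at 4x4 and 16x16 using the corresponding vpx_idct4_vsx/vpx_idct16_vsx, vp9_iadst4_vsx/vpx_iadst16_vsx, and round-store helpers declared in the new header.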