diff options
-rw-r--r-- | test/dct_test.cc | 102 | ||||
-rw-r--r-- | test/encode_api_test.cc | 72 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 9 | ||||
-rw-r--r-- | vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c | 419 | ||||
-rw-r--r-- | vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c | 255 | ||||
-rw-r--r-- | vp9/common/x86/vp9_idct_intrin_sse2.c | 8 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 2 | ||||
-rw-r--r-- | vpx/src/vpx_encoder.c | 50 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 6 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 14 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_inv_txfm_sse4.h | 3 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_txfm_sse2.c | 10 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_txfm_sse2.h | 3 |
14 files changed, 898 insertions, 59 deletions
diff --git a/test/dct_test.cc b/test/dct_test.cc index a5ac9a0dc..379fbecc0 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -597,7 +597,9 @@ class TransHT : public TransTestBase { TransHT() { fwd_txfm_ref = fht_ref; } }; -TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } +TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 2 : 1); +} TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } @@ -605,17 +607,6 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); } TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2), - */ - const DctParam c_ht_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH make_tuple(&vp9_highbd_fht16x16_c, @@ -642,6 +633,19 @@ const DctParam c_ht_tests[] = { make_tuple(&vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8, 2), @@ -784,6 +788,80 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE4_1, TransHT, ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_12, 2), + + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>, 4, 0, VPX_BITS_8, 2), diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 164db5a7b..13de53464 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -106,4 +106,76 @@ TEST(EncodeAPI, ImageSizeSetting) { } #endif +#if CONFIG_MULTI_RES_ENCODING +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. +TEST(EncodeAPI, VP8MultiResEncode) { + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], vpx_codec_vp8_cx(), &cfg[0], 2, 0, + &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } +} +#endif + } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index dd6120266..7ee7ddaee 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -97,13 +97,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vp9_highbd_iht4x4_16_add sse4_1/; - } add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + specialize qw/vp9_highbd_iht4x4_16_add sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add sse4_1/; + specialize qw/vp9_highbd_iht16x16_256_add sse4_1/; + } } # diff --git a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 000000000..57b79a732 --- /dev/null +++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = _mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); + } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 000000000..7d949b6db --- /dev/null +++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 6996260e2..95dad919e 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -68,16 +68,16 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; case 1: // ADST_DCT - idct8_sse2(in); + vpx_idct8_sse2(in); iadst8_sse2(in); break; case 2: // DCT_ADST iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; case 3: // ADST_ADST iadst8_sse2(in); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 9819fb641..377411431 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -82,6 +82,8 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c else VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 4390cf7c8..a26204bc4 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder algorithms. * */ +#include <assert.h> #include <limits.h> +#include <stdlib.h> #include <string.h> +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -89,28 +92,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, encoder at higher + * resolution always use the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +126,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index de097c66a..7898ee12c 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[15] = in[15]; } -static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; // stage 2 @@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, in = all[i]; highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); - highbd_idct16_4col(in); + vpx_highbd_idct16_4col_sse4_1(in); input += 4 * 16; } @@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, transpose_32bit_4x4(all[1] + i, out + 4); transpose_32bit_4x4(all[2] + i, out + 8); transpose_32bit_4x4(all[3] + i, out + 12); - highbd_idct16_4col(out); + vpx_highbd_idct16_4col_sse4_1(out); for (j = 0; j < 16; ++j) { highbd_write_buffer_4(dest + j * stride, out[j], bd); diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 909a6b794..bb7a510e1 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index ae391b2c0..8b2e3d241 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -17,7 +17,7 @@ #include "vpx_dsp/x86/inv_txfm_ssse3.h" #include "vpx_dsp/x86/transpose_sse2.h" -static void highbd_idct8x8_half1d(__m128i *const io) { +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); @@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); temp[0] = io[4]; temp[1] = io[5]; @@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[5] = io[9]; io[6] = io[10]; io[7] = io[11]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = temp[0]; io[9] = temp[1]; io[10] = temp[2]; io[11] = temp[3]; - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); highbd_idct8x8_final_round(io); } diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 435934f1b..5a7fd1d39 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -106,4 +106,7 @@ static INLINE void highbd_idct4_sse4_1(__m128i *const io) { io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] } +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + #endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 6b1837df5..4b02da966 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -165,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { - idct8_sse2(in); + vpx_idct8_sse2(in); } write_buffer_8x8(in, dest, stride); @@ -221,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *const in) { +void vpx_idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() transpose_16bit_8x8(in, in); @@ -514,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, } } -static void iadst16_8col(__m128i *const in) { +void vpx_iadst16_8col_sse2(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -874,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) { void iadst16_sse2(__m128i *const in0, __m128i *const in1) { transpose_16bit_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); } // Group the coefficient calculation into smaller functions to prevent stack diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 5cd5098f1..d573f66c9 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -697,10 +697,11 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( } void idct4_sse2(__m128i *const in); -void idct8_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); void idct16_sse2(__m128i *const in0, __m128i *const in1); void iadst4_sse2(__m128i *const in); void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); void iadst16_sse2(__m128i *const in0, __m128i *const in1); void idct32_1024_8x32(const __m128i *const in, __m128i *const out); void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); |