-rw-r--r--  test/dct_test.cc                               | 102
-rw-r--r--  test/encode_api_test.cc                        |  72
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                    |   9
-rw-r--r--  vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c  | 419
-rw-r--r--  vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c    | 255
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c          |   8
-rw-r--r--  vp9/vp9_common.mk                              |   2
-rw-r--r--  vpx/src/vpx_encoder.c                          |  50
-rw-r--r--  vpx_dsp/x86/highbd_idct16x16_add_sse4.c        |   6
-rw-r--r--  vpx_dsp/x86/highbd_idct8x8_add_sse2.c          |   4
-rw-r--r--  vpx_dsp/x86/highbd_idct8x8_add_sse4.c          |  14
-rw-r--r--  vpx_dsp/x86/highbd_inv_txfm_sse4.h             |   3
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c                    |  10
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.h                    |   3
14 files changed, 898 insertions(+), 59 deletions(-)
diff --git a/test/dct_test.cc b/test/dct_test.cc
index a5ac9a0dc..379fbecc0 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -597,7 +597,9 @@ class TransHT : public TransTestBase {
TransHT() { fwd_txfm_ref = fht_ref; }
};
-TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); }
+TEST_P(TransHT, AccuracyCheck) {
+ RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 2 : 1);
+}
TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
@@ -605,17 +607,6 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); }
TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-/* TODO:(johannkoenig) Determine why these fail AccuracyCheck
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2),
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2),
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2),
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2),
- */
-
const DctParam c_ht_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
make_tuple(&vp9_highbd_fht16x16_c,
@@ -642,6 +633,19 @@ const DctParam c_ht_tests[] = {
make_tuple(&vp9_highbd_fht16x16_c,
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
+ VPX_BITS_12, 2),
+
make_tuple(&vp9_highbd_fht8x8_c,
&highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8,
2),
@@ -784,6 +788,80 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE4_1, TransHT,
::testing::Values(
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 0, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 1, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 2, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 3, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 0, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 1, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 2, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 3, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 0, VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 1, VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 2, VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 3, VPX_BITS_12, 2),
+
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3,
+ VPX_BITS_12, 2),
+
make_tuple(&vp9_highbd_fht4x4_c,
&highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>, 4, 0,
VPX_BITS_8, 2),
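
The AccuracyCheck change above widens the allowed error from 1 to 2 for 16x16 transforms at bit depths above 10, which is what lets the VPX_BITS_12 16x16 vectors, previously commented out with a TODO, run again. A minimal sketch of what such a tolerance check amounts to (hypothetical helper, not the actual RunAccuracyCheck in test/dct_test.cc):

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in for the harness: after a forward+inverse
 * transform round trip, require every reconstructed sample to stay
 * within `limit` of the source. */
static int round_trip_within_limit(const uint16_t *src, const uint16_t *recon,
                                   int num_samples, int limit) {
  int i;
  for (i = 0; i < num_samples; ++i) {
    if (abs((int)recon[i] - (int)src[i]) > limit) return 0; /* over limit */
  }
  return 1; /* all samples within tolerance */
}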
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 164db5a7b..13de53464 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -106,4 +106,76 @@ TEST(EncodeAPI, ImageSizeSetting) {
}
#endif
+#if CONFIG_MULTI_RES_ENCODING
+// Set up 2 spatial streams with 2 temporal layers per stream, and generate
+// an invalid configuration by setting the temporal layer rate allocation
+// (ts_target_bitrate[]) to 0 for both layers.
+TEST(EncodeAPI, VP8MultiResEncode) {
+ const int width = 1280;
+ const int height = 720;
+ const int width_down = width / 2;
+ const int height_down = height / 2;
+ const int target_bitrate = 1000;
+ const int framerate = 30;
+ vpx_codec_ctx_t enc[2];
+ vpx_codec_enc_cfg_t cfg[2];
+ vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
+
+ memset(enc, 0, sizeof(enc));
+
+ for (int i = 0; i < 2; i++) {
+ vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg[i], 0);
+ }
+
+ /* Highest-resolution encoder settings */
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].rc_dropframe_thresh = 0;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 2;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 100;
+ cfg[0].rc_overshoot_pct = 15;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+ cfg[0].g_lag_in_frames = 0;
+
+ cfg[0].kf_mode = VPX_KF_AUTO;
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+
+ cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
+ cfg[0].g_timebase.num = 1; /* Set fps */
+ cfg[0].g_timebase.den = framerate;
+
+ memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
+ cfg[1].rc_target_bitrate = 500;
+ cfg[1].g_w = width_down;
+ cfg[1].g_h = height_down;
+
+ for (int i = 0; i < 2; i++) {
+ cfg[i].ts_number_layers = 2;
+ cfg[i].ts_periodicity = 2;
+ cfg[i].ts_rate_decimator[0] = 2;
+ cfg[i].ts_rate_decimator[1] = 1;
+ cfg[i].ts_layer_id[0] = 0;
+ cfg[i].ts_layer_id[1] = 1;
+ // Invalid parameters.
+ cfg[i].ts_target_bitrate[0] = 0;
+ cfg[i].ts_target_bitrate[1] = 0;
+ }
+
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+ vpx_codec_enc_init_multi(&enc[0], vpx_codec_vp8_cx(), &cfg[0], 2, 0,
+ &dsf[0]));
+
+ for (int i = 0; i < 2; i++) {
+ vpx_codec_destroy(&enc[i]);
+ }
+}
+#endif
+
} // namespace
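
For contrast with the zeroed rates the test feeds in, a configuration the validator should accept gives each temporal layer a nonzero, nondecreasing target. A sketch continuing from the test's cfg[] above (rates illustrative; in libvpx's temporal-layer examples ts_target_bitrate[] is cumulative, each layer's value including the layers below it):

  int i;
  for (i = 0; i < 2; i++) {
    cfg[i].ts_target_bitrate[0] = cfg[i].rc_target_bitrate / 2; /* base */
    cfg[i].ts_target_bitrate[1] = cfg[i].rc_target_bitrate; /* base + enh */
  }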
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index dd6120266..7ee7ddaee 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -97,13 +97,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
- }
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
+
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
+ specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+ specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
+ }
}
#
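For reference, a sketch of what a specialize line expands to in the generated vp9_rtcd.h (illustrative, not the literal generator output): without a specialization the prototype collapses to a #define onto the _c version; with one, the symbol becomes a function pointer that setup_rtcd_internal() retargets after CPU detection.

RTCD_EXTERN void (*vp9_highbd_iht8x8_64_add)(const tran_low_t *input,
                                             uint16_t *dest, int stride,
                                             int tx_type, int bd);
/* ...and in setup_rtcd_internal(): */
vp9_highbd_iht8x8_64_add = vp9_highbd_iht8x8_64_add_c;
if (flags & HAS_SSE4_1)
  vp9_highbd_iht8x8_64_add = vp9_highbd_iht8x8_64_add_sse4_1;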
diff --git a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
new file mode 100644
index 000000000..57b79a732
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+ const int c,
+ __m128i *const s) {
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);
+ s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+ const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const s0,
+ __m128i *const s1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+ s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+ x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+
+ // stage 1
+ highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+ highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+ s15);
+
+ x0[0] = _mm_add_epi64(s0[0], s8[0]);
+ x0[1] = _mm_add_epi64(s0[1], s8[1]);
+ x1[0] = _mm_add_epi64(s1[0], s9[0]);
+ x1[1] = _mm_add_epi64(s1[1], s9[1]);
+ x2[0] = _mm_add_epi64(s2[0], s10[0]);
+ x2[1] = _mm_add_epi64(s2[1], s10[1]);
+ x3[0] = _mm_add_epi64(s3[0], s11[0]);
+ x3[1] = _mm_add_epi64(s3[1], s11[1]);
+ x4[0] = _mm_add_epi64(s4[0], s12[0]);
+ x4[1] = _mm_add_epi64(s4[1], s12[1]);
+ x5[0] = _mm_add_epi64(s5[0], s13[0]);
+ x5[1] = _mm_add_epi64(s5[1], s13[1]);
+ x6[0] = _mm_add_epi64(s6[0], s14[0]);
+ x6[1] = _mm_add_epi64(s6[1], s14[1]);
+ x7[0] = _mm_add_epi64(s7[0], s15[0]);
+ x7[1] = _mm_add_epi64(s7[1], s15[1]);
+ x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+ x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+ x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+ x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+ x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+ x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+ x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+ x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+ x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x0[0] = pack_4(x0[0], x0[1]);
+ x1[0] = pack_4(x1[0], x1[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 2
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ s4[0] = x4[0];
+ s5[0] = x5[0];
+ s6[0] = x6[0];
+ s7[0] = x7[0];
+ x0[0] = _mm_add_epi32(s0[0], s4[0]);
+ x1[0] = _mm_add_epi32(s1[0], s5[0]);
+ x2[0] = _mm_add_epi32(s2[0], s6[0]);
+ x3[0] = _mm_add_epi32(s3[0], s7[0]);
+ x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+ x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+ x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+ x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+
+ highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+ s12);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+ s14);
+
+ x8[0] = _mm_add_epi64(s8[0], s12[0]);
+ x8[1] = _mm_add_epi64(s8[1], s12[1]);
+ x9[0] = _mm_add_epi64(s9[0], s13[0]);
+ x9[1] = _mm_add_epi64(s9[1], s13[1]);
+ x10[0] = _mm_add_epi64(s10[0], s14[0]);
+ x10[1] = _mm_add_epi64(s10[1], s14[1]);
+ x11[0] = _mm_add_epi64(s11[0], s15[0]);
+ x11[1] = _mm_add_epi64(s11[1], s15[1]);
+ x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 3
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+ s8[0] = x8[0];
+ s9[0] = x9[0];
+ s10[0] = x10[0];
+ s11[0] = x11[0];
+ highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+ s14);
+
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = _mm_add_epi32(s8[0], s10[0]);
+ x9[0] = _mm_add_epi32(s9[0], s11[0]);
+ x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+ x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+ x12[0] = _mm_add_epi64(s12[0], s14[0]);
+ x12[1] = _mm_add_epi64(s12[1], s14[1]);
+ x13[0] = _mm_add_epi64(s13[0], s15[0]);
+ x13[1] = _mm_add_epi64(s13[1], s15[1]);
+ x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 4
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x7[0], x6[0]);
+ s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+ s10[0] = _mm_add_epi32(x11[0], x10[0]);
+ s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+ s14[0] = _mm_add_epi32(x14[0], x15[0]);
+ s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+ highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+ highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+ highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+ highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x10[0] = dct_const_round_shift_64bit(s10[0]);
+ x10[1] = dct_const_round_shift_64bit(s10[1]);
+ x11[0] = dct_const_round_shift_64bit(s11[0]);
+ x11[1] = dct_const_round_shift_64bit(s11[1]);
+ x14[0] = dct_const_round_shift_64bit(s14[0]);
+ x14[1] = dct_const_round_shift_64bit(s14[1]);
+ x15[0] = dct_const_round_shift_64bit(s15[0]);
+ x15[1] = dct_const_round_shift_64bit(s15[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+ io[2] = x12[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[4] = x6[0];
+ io[5] = x14[0];
+ io[6] = x10[0];
+ io[7] = x2[0];
+ io[8] = x3[0];
+ io[9] = x11[0];
+ io[10] = x15[0];
+ io[11] = x7[0];
+ io[12] = x5[0];
+ io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+ io[14] = x9[0];
+ io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ idct16_8col(in, in);
+ } else {
+ vpx_iadst16_8col_sse2(in);
+ }
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ idct16_8col(out, out);
+ } else {
+ vpx_iadst16_8col_sse2(out);
+ }
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct16_4col_sse4_1(in);
+ } else {
+ highbd_iadst16_4col_sse4_1(in);
+ }
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct16_4col_sse4_1(out);
+ } else {
+ highbd_iadst16_4col_sse4_1(out);
+ }
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
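
A scalar model of the two butterfly helpers at the top of this file may help when reading the SIMD: each butterfly forms in0*c0 + in1*c1 and in0*c1 - in1*c0 in 64-bit precision, and dct_const_round_shift_64bit later rounds by DCT_CONST_BITS. The 4*c pre-scaling in pair_set_epi32 appears to exist so the vector rounding shift can work in whole bytes (16 bits rather than 14). A sketch of the arithmetic being modeled (helper names here are illustrative):

#include <stdint.h>

#define DCT_CONST_BITS 14 /* as in vpx_dsp/txfm_common.h */

/* Scalar counterpart of dct_const_round_shift_64bit(). */
static int64_t round_shift64(int64_t x) {
  return (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

/* Scalar equivalent (sketch) of highbd_iadst_butterfly_sse4_1(). */
static void iadst_butterfly(int32_t in0, int32_t in1, int c0, int c1,
                            int64_t *s0, int64_t *s1) {
  *s0 = (int64_t)in0 * c0 + (int64_t)in1 * c1; /* t00 + t11 */
  *s1 = (int64_t)in0 * c1 - (int64_t)in1 * c0; /* t10 - t01 */
}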
diff --git a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
new file mode 100644
index 000000000..7d949b6db
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+ const int c,
+ __m128i *const s) {
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);
+ s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+ const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const s0,
+ __m128i *const s1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst8_sse4_1(__m128i *const io) {
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
+ x0[0] = _mm_add_epi64(s0[0], s4[0]);
+ x0[1] = _mm_add_epi64(s0[1], s4[1]);
+ x1[0] = _mm_add_epi64(s1[0], s5[0]);
+ x1[1] = _mm_add_epi64(s1[1], s5[1]);
+ x4[0] = _mm_sub_epi64(s0[0], s4[0]);
+ x4[1] = _mm_sub_epi64(s0[1], s4[1]);
+ x5[0] = _mm_sub_epi64(s1[0], s5[0]);
+ x5[1] = _mm_sub_epi64(s1[1], s5[1]);
+
+ highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
+ x2[0] = _mm_add_epi64(s2[0], s6[0]);
+ x2[1] = _mm_add_epi64(s2[1], s6[1]);
+ x3[0] = _mm_add_epi64(s3[0], s7[0]);
+ x3[1] = _mm_add_epi64(s3[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s2[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s2[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s3[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s3[1], s7[1]);
+
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ s0[0] = pack_4(x0[0], x0[1]); // s0 = x0;
+ s1[0] = pack_4(x1[0], x1[1]); // s1 = x1;
+ s2[0] = pack_4(x2[0], x2[1]); // s2 = x2;
+ s3[0] = pack_4(x3[0], x3[1]); // s3 = x3;
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 2
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 3
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x6[0], x7[0]);
+ s7[0] = _mm_sub_epi32(x6[0], x7[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[2] = x6[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
+ io[4] = x3[0];
+ io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
+ io[6] = x5[0];
+ io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+ highbd_idct8x8_final_round(io);
+ }
+ recon_and_store_8x8(io, dest, stride, bd);
+}
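
Both new files select the 1-D transform for each pass with the same pair of conditions; per tx_type the (first pass, second pass) types are DCT_DCT: (idct, idct), ADST_DCT: (idct, iadst), DCT_ADST: (iadst, idct), ADST_ADST: (iadst, iadst). A sketch of the predicates (tx_type values 0-3 as in vp9):

/* tx_type values as used by vp9: */
enum { kDctDct = 0, kAdstDct = 1, kDctAdst = 2, kAdstAdst = 3 };

static int first_pass_is_idct(int tx_type) {
  return tx_type == kDctDct || tx_type == kAdstDct;
}
static int second_pass_is_idct(int tx_type) {
  return tx_type == kDctDct || tx_type == kDctAdst;
}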
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 6996260e2..95dad919e 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -68,16 +68,16 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
switch (tx_type) {
case 0: // DCT_DCT
- idct8_sse2(in);
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
+ vpx_idct8_sse2(in);
break;
case 1: // ADST_DCT
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
iadst8_sse2(in);
break;
case 2: // DCT_ADST
iadst8_sse2(in);
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
break;
case 3: // ADST_ADST
iadst8_sse2(in);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 9819fb641..377411431 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -82,6 +82,8 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
else
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c
endif
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 4390cf7c8..a26204bc4 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -12,8 +12,11 @@
* \brief Provides the high level interface to wrap encoder algorithms.
*
*/
+#include <assert.h>
#include <limits.h>
+#include <stdlib.h>
#include <string.h>
+#include "vp8/common/blockd.h"
#include "vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
@@ -89,28 +92,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
dsf->den > dsf->num) {
res = VPX_CODEC_INVALID_PARAM;
- break;
+ } else {
+ mr_cfg.mr_low_res_mode_info = mem_loc;
+ mr_cfg.mr_total_resolutions = num_enc;
+ mr_cfg.mr_encoder_id = num_enc - 1 - i;
+ mr_cfg.mr_down_sampling_factor.num = dsf->num;
+ mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+ /* Force key-frame synchronization. Namely, encoders at higher
+ * resolutions always use the same frame_type chosen by the
+ * lowest-resolution encoder.
+ */
+ if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED;
+
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.enc = cfg;
+ res = ctx->iface->init(ctx, &mr_cfg);
}
- mr_cfg.mr_low_res_mode_info = mem_loc;
- mr_cfg.mr_total_resolutions = num_enc;
- mr_cfg.mr_encoder_id = num_enc - 1 - i;
- mr_cfg.mr_down_sampling_factor.num = dsf->num;
- mr_cfg.mr_down_sampling_factor.den = dsf->den;
-
- /* Force Key-frame synchronization. Namely, encoder at higher
- * resolution always use the same frame_type chosen by the
- * lowest-resolution encoder.
- */
- if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED;
-
- ctx->iface = iface;
- ctx->name = iface->name;
- ctx->priv = NULL;
- ctx->init_flags = flags;
- ctx->config.enc = cfg;
- res = ctx->iface->init(ctx, &mr_cfg);
-
if (res) {
const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
/* Destroy current ctx */
@@ -124,10 +126,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
vpx_codec_destroy(ctx);
i--;
}
+#if CONFIG_MULTI_RES_ENCODING
+ assert(mem_loc);
+ free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info);
+ free(mem_loc);
+#endif
+ return SAVE_STATUS(ctx, res);
}
- if (res) break;
-
ctx++;
cfg++;
dsf++;
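
With the restructured error path above, a failing vpx_codec_enc_init_multi() frees the shared low-res mode info itself before returning, so a caller only has to destroy its contexts, exactly as the new test does. A caller-side sketch (hypothetical wrapper):

#include <string.h>
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Hypothetical helper mirroring the pattern in the new test. */
static vpx_codec_err_t init_two_stream_multi_res(vpx_codec_ctx_t enc[2],
                                                 vpx_codec_enc_cfg_t cfg[2],
                                                 vpx_rational_t dsf[2]) {
  vpx_codec_err_t res;
  int i;
  memset(enc, 0, 2 * sizeof(*enc));
  res = vpx_codec_enc_init_multi(&enc[0], vpx_codec_vp8_cx(), &cfg[0], 2, 0,
                                 &dsf[0]);
  if (res != VPX_CODEC_OK) {
    /* Safe even for contexts whose init never completed. */
    for (i = 0; i < 2; i++) vpx_codec_destroy(&enc[i]);
  }
  return res;
}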
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
index de097c66a..7898ee12c 100644
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
out[15] = in[15];
}
-static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
// stage 2
@@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
in = all[i];
highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
- highbd_idct16_4col(in);
+ vpx_highbd_idct16_4col_sse4_1(in);
input += 4 * 16;
}
@@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
transpose_32bit_4x4(all[1] + i, out + 4);
transpose_32bit_4x4(all[2] + i, out + 8);
transpose_32bit_4x4(all[3] + i, out + 12);
- highbd_idct16_4col(out);
+ vpx_highbd_idct16_4col_sse4_1(out);
for (j = 0; j < 16; ++j) {
highbd_write_buffer_4(dest + j * stride, out[j], bd);
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
index 909a6b794..bb7a510e1 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
io_short[6] = _mm_packs_epi32(io[10], io[14]);
io_short[7] = _mm_packs_epi32(io[11], io[15]);
- idct8_sse2(io_short);
- idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
round_shift_8x8(io_short, io);
} else {
__m128i temp[4];
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
index ae391b2c0..8b2e3d241 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -17,7 +17,7 @@
#include "vpx_dsp/x86/inv_txfm_ssse3.h"
#include "vpx_dsp/x86/transpose_sse2.h"
-static void highbd_idct8x8_half1d(__m128i *const io) {
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) {
__m128i step1[8], step2[8];
transpose_32bit_4x4x2(io, io);
@@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
io_short[6] = _mm_packs_epi32(io[10], io[14]);
io_short[7] = _mm_packs_epi32(io[11], io[15]);
- idct8_sse2(io_short);
- idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
round_shift_8x8(io_short, io);
} else {
__m128i temp[4];
- highbd_idct8x8_half1d(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
@@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
- highbd_idct8x8_half1d(&io[8]);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
temp[0] = io[4];
temp[1] = io[5];
@@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
io[5] = io[9];
io[6] = io[10];
io[7] = io[11];
- highbd_idct8x8_half1d(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
io[8] = temp[0];
io[9] = temp[1];
io[10] = temp[2];
io[11] = temp[3];
- highbd_idct8x8_half1d(&io[8]);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
highbd_idct8x8_final_round(io);
}
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index 435934f1b..5a7fd1d39 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -106,4 +106,7 @@ static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
}
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
+
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 6b1837df5..4b02da966 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -165,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
// 2-D
for (i = 0; i < 2; i++) {
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
}
write_buffer_8x8(in, dest, stride);
@@ -221,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
recon_and_store_8_dual(dest, dc_value, stride);
}
-void idct8_sse2(__m128i *const in) {
+void vpx_idct8_sse2(__m128i *const in) {
// 8x8 Transpose is copied from vpx_fdct8x8_sse2()
transpose_16bit_8x8(in, in);
@@ -514,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
-static void iadst16_8col(__m128i *const in) {
+void vpx_iadst16_8col_sse2(__m128i *const in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -874,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) {
void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
transpose_16bit_16x16(in0, in1);
- iadst16_8col(in0);
- iadst16_8col(in1);
+ vpx_iadst16_8col_sse2(in0);
+ vpx_iadst16_8col_sse2(in1);
}
// Group the coefficient calculation into smaller functions to prevent stack
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index 5cd5098f1..d573f66c9 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -697,10 +697,11 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7(
}
void idct4_sse2(__m128i *const in);
-void idct8_sse2(__m128i *const in);
+void vpx_idct8_sse2(__m128i *const in);
void idct16_sse2(__m128i *const in0, __m128i *const in1);
void iadst4_sse2(__m128i *const in);
void iadst8_sse2(__m128i *const in);
+void vpx_iadst16_8col_sse2(__m128i *const in);
void iadst16_sse2(__m128i *const in0, __m128i *const in1);
void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);