diff options
60 files changed, 1651 insertions, 1162 deletions
diff --git a/build/make/Android.mk b/build/make/Android.mk index db0cebff5..cf6221017 100644 --- a/build/make/Android.mk +++ b/build/make/Android.mk @@ -112,12 +112,12 @@ endef # Use ads2gas script to convert from RVCT format to GAS format. This passes # puts the processed file under $(ASM_CNV_PATH). Local clean rule # to handle removing these -ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/asm_com_offsets.asm +ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm ifeq ($(CONFIG_VP8_DECODER), yes) - ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_dec_offsets.asm + ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm endif ifeq ($(CONFIG_VP8_ENCODER), yes) - ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_enc_offsets.asm + ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm endif .PRECIOUS: %.asm.s @@ -190,19 +190,19 @@ clean: include $(BUILD_SHARED_LIBRARY) $(eval $(call asm_offsets_template,\ - $(ASM_CNV_PATH)/asm_com_offsets.asm, \ - $(LIBVPX_PATH)/vp8/common/asm_com_offsets.c)) + $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm, \ + $(LIBVPX_PATH)/vp8/common/vp8_asm_com_offsets.c)) ifeq ($(CONFIG_VP8_DECODER), yes) $(eval $(call asm_offsets_template,\ - $(ASM_CNV_PATH)/asm_dec_offsets.asm, \ - $(LIBVPX_PATH)/vp8/decoder/asm_dec_offsets.c)) + $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm, \ + $(LIBVPX_PATH)/vp8/decoder/vp8_asm_dec_offsets.c)) endif ifeq ($(CONFIG_VP8_ENCODER), yes) $(eval $(call asm_offsets_template,\ - $(ASM_CNV_PATH)/asm_enc_offsets.asm, \ - $(LIBVPX_PATH)/vp8/encoder/asm_enc_offsets.c)) + $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm, \ + $(LIBVPX_PATH)/vp8/encoder/vp8_asm_enc_offsets.c)) endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) diff --git a/build/make/Makefile b/build/make/Makefile index da7fb03a0..4ac5bcf1f 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -377,7 +377,7 @@ ifneq ($(call enabled,DIST-SRCS),) DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/yasm.rules DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/obj_int_extract.bat DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh - # Include obj_int_extract if we use offsets from asm_*_offsets + # Include obj_int_extract if we use offsets from *_asm_*_offsets DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas.pl DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas_apple.pl diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c index f86cec2ac..1604b5e68 100644 --- a/build/make/obj_int_extract.c +++ b/build/make/obj_int_extract.c @@ -144,7 +144,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) { /* Location of string is cacluated each time from the * start of the string buffer. On darwin the symbols * are prefixed by "_", so we bump the pointer by 1. - * The target value is defined as an int in asm_*_offsets.c, + * The target value is defined as an int in *_asm_*_offsets.c, * which is 4 bytes on all targets we currently use. */ if (bits == 32) { @@ -446,7 +446,7 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) { if (strcmp(section_name, ".bss")) { if (sizeof(val) != sym.st_size) { /* The target value is declared as an int in - * asm_*_offsets.c, which is 4 bytes on all + * *_asm_*_offsets.c, which is 4 bytes on all * targets we currently use. Complain loudly if * this is not true. */ @@ -528,7 +528,7 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) { if ((strcmp(section_name, ".bss"))) { if (sizeof(val) != sym.st_size) { /* The target value is declared as an int in - * asm_*_offsets.c, which is 4 bytes on all + * *_asm_*_offsets.c, which is 4 bytes on all * targets we currently use. Complain loudly if * this is not true. */ diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat index 70b39f68a..47fef974c 100644 --- a/build/x86-msvs/obj_int_extract.bat +++ b/build/x86-msvs/obj_int_extract.bat @@ -14,10 +14,10 @@ obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm" obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm" obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm" -cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c" -obj_int_extract.exe rvds "asm_com_offsets.obj" > "vp8_asm_com_offsets.asm" -obj_int_extract.exe rvds "asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm" -obj_int_extract.exe rvds "asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm" +cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c" +cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c" +cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c" +obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm" +obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm" +obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm" @@ -299,6 +299,7 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising experimental + decrypt ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -348,6 +349,7 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising experimental + decrypt " process_cmdline() { diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc index da43310c1..1f6d54064 100644 --- a/test/decode_test_driver.cc +++ b/test/decode_test_driver.cc @@ -14,18 +14,13 @@ #include "test/video_source.h" namespace libvpx_test { -void Decoder::DecodeFrame(const uint8_t *cxdata, int size) { - if (!decoder_.priv) { - const vpx_codec_err_t res_init = vpx_codec_dec_init(&decoder_, - CodecInterface(), - &cfg_, 0); - ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError(); - } +vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, int size) { vpx_codec_err_t res_dec; + InitOnce(); REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_, cxdata, size, NULL, 0)); - ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError(); + return res_dec; } void DecoderTest::RunLoop(CompressedVideoSource *video) { @@ -35,7 +30,9 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) { // Decode frames. for (video->Begin(); video->cxdata(); video->Next()) { - decoder->DecodeFrame(video->cxdata(), video->frame_size()); + vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(), + video->frame_size()); + ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); DxDataIterator dec_iter = decoder->GetDxData(); const vpx_image_t *img = NULL; diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index ed7069004..49e7384f4 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -42,7 +42,7 @@ class DxDataIterator { class Decoder { public: Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline) - : cfg_(cfg), deadline_(deadline) { + : cfg_(cfg), deadline_(deadline), init_done_(false) { memset(&decoder_, 0, sizeof(decoder_)); } @@ -50,7 +50,7 @@ class Decoder { vpx_codec_destroy(&decoder_); } - void DecodeFrame(const uint8_t *cxdata, int size); + vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, int size); DxDataIterator GetDxData() { return DxDataIterator(&decoder_); @@ -61,21 +61,39 @@ class Decoder { } void Control(int ctrl_id, int arg) { + InitOnce(); const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError(); } - protected: - virtual const vpx_codec_iface_t* CodecInterface() const = 0; + void Control(int ctrl_id, const void *arg) { + InitOnce(); + const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError(); + } const char* DecodeError() { const char *detail = vpx_codec_error_detail(&decoder_); return detail ? detail : vpx_codec_error(&decoder_); } + protected: + virtual const vpx_codec_iface_t* CodecInterface() const = 0; + + void InitOnce() { + if (!init_done_) { + const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_, + CodecInterface(), + &cfg_, 0); + ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError(); + init_done_ = true; + } + } + vpx_codec_ctx_t decoder_; vpx_codec_dec_cfg_t cfg_; unsigned int deadline_; + bool init_done_; }; // Common test functionality for all Decoder tests. diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 75921aa02..eed3e33af 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -175,8 +175,9 @@ void EncoderTest::RunLoop(VideoSource *video) { case VPX_CODEC_CX_FRAME_PKT: has_cxdata = true; if (decoder && DoDecode()) { - decoder->DecodeFrame((const uint8_t*)pkt->data.frame.buf, - pkt->data.frame.sz); + vpx_codec_err_t res_dec = decoder->DecodeFrame( + (const uint8_t*)pkt->data.frame.buf, pkt->data.frame.sz); + ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); has_dxdata = true; } ASSERT_GE(pkt->data.frame.pts, last_pts_); diff --git a/test/test.mk b/test/test.mk index 793fbf8b2..0d069d026 100644 --- a/test/test.mk +++ b/test/test.mk @@ -31,6 +31,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc + ## ## WHITE BOX TESTS ## @@ -55,6 +56,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc endif # VP8 diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index acff0fdfb..711d0bd45 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -32,10 +32,9 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, cfg.w = 704; cfg.h = 144; cfg.threads = 1; - cfg.inv_tile_order = 0; fw_dec_ = codec_->CreateDecoder(cfg, 0); - cfg.inv_tile_order = 1; inv_dec_ = codec_->CreateDecoder(cfg, 0); + inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); } virtual ~TileIndependenceTest() { diff --git a/test/vp8_boolcoder_test.cc b/test/vp8_boolcoder_test.cc index 4e21be8c5..ab19c3412 100644 --- a/test/vp8_boolcoder_test.cc +++ b/test/vp8_boolcoder_test.cc @@ -26,6 +26,20 @@ extern "C" { namespace { const int num_tests = 10; + +void encrypt_buffer(uint8_t *buffer, int size, const uint8_t *key) { + for (int i = 0; i < size; ++i) { + buffer[i] ^= key[i % 32]; + } +} + +const uint8_t secret_key[32] = { + 234, 32, 2, 3, 4, 230, 6, 11, + 0, 132, 22, 23, 45, 21, 124, 255, + 0, 43, 52, 3, 23, 63, 99, 7, + 120, 8, 252, 84, 4, 83, 6, 13 +}; + } // namespace using libvpx_test::ACMRandom; @@ -71,7 +85,12 @@ TEST(VP8, TestBitIO) { vp8_stop_encode(&bw); BOOL_DECODER br; - vp8dx_start_decode(&br, bw_buffer, buffer_size); + +#if CONFIG_DECRYPT + encrypt_buffer(bw_buffer, buffer_size, secret_key); +#endif + + vp8dx_start_decode(&br, bw_buffer, buffer_size, bw_buffer, secret_key); bit_rnd.Reset(random_seed); for (int i = 0; i < bits_to_test; ++i) { if (bit_method == 2) { diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc new file mode 100644 index 000000000..ea7b92049 --- /dev/null +++ b/test/vp8_decrypt_test.cc @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <cstdio> +#include <cstdlib> +#include <string> +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" + +#if CONFIG_DECRYPT + +namespace { + +const uint8_t decrypt_key[32] = { + 255, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +} // namespace + +namespace libvpx_test { + +TEST(TestDecrypt, NullKey) { + vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_ctx_t decoder = {0}; + vpx_codec_err_t res = vpx_codec_dec_init(&decoder, &vpx_codec_vp8_dx_algo, + &cfg, 0); + ASSERT_EQ(VPX_CODEC_OK, res); + + res = vpx_codec_control(&decoder, VP8_SET_DECRYPT_KEY, NULL); + ASSERT_EQ(VPX_CODEC_INVALID_PARAM, res); +} + +TEST(TestDecrypt, DecryptWorks) { + libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf"); + video.Init(); + + vpx_codec_dec_cfg_t dec_cfg = {0}; + Decoder decoder(dec_cfg, 0); + + // Zero decrypt key (by default) + video.Begin(); + vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size()); + ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + + // Non-zero decrypt key + video.Next(); + decoder.Control(VP8_SET_DECRYPT_KEY, decrypt_key); + res = decoder.DecodeFrame(video.cxdata(), video.frame_size()); + ASSERT_NE(VPX_CODEC_OK, res) << decoder.DecodeError(); +} + +} // namespace libvpx_test + +#endif // CONFIG_DECRYPT diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index 8681b7a6a..19857a7e9 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -156,39 +156,38 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, continue; } - lvl_ref = lvl_seg; - /* INTRA_FRAME */ ref = INTRA_FRAME; /* Apply delta for reference frame */ - lvl_ref += mbd->ref_lf_deltas[ref]; + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; /* Apply delta for Intra modes */ mode = 0; /* B_PRED */ /* Only the split mode BPRED has a further special case */ - lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; - lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; lfi->lvl[seg][ref][mode] = lvl_mode; mode = 1; /* all the rest of Intra modes */ - lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */ + /* clamp */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; lfi->lvl[seg][ref][mode] = lvl_mode; /* LAST, GOLDEN, ALT */ for(ref = 1; ref < MAX_REF_FRAMES; ref++) { - int lvl_ref = lvl_seg; - /* Apply delta for reference frame */ - lvl_ref += mbd->ref_lf_deltas[ref]; + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; /* Apply delta for Inter modes */ for (mode = 1; mode < 4; mode++) { lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; - lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; lfi->lvl[seg][ref][mode] = lvl_mode; } diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 8235f6e9f..1d51696ff 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -54,7 +54,7 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, { signed char ps0, qs0; signed char ps1, qs1; - signed char vp8_filter, Filter1, Filter2; + signed char filter_value, Filter1, Filter2; signed char u; ps1 = (signed char) * op1 ^ 0x80; @@ -63,35 +63,35 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, qs1 = (signed char) * oq1 ^ 0x80; /* add outer taps if we have high edge variance */ - vp8_filter = vp8_signed_char_clamp(ps1 - qs1); - vp8_filter &= hev; + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value &= hev; /* inner taps */ - vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); - vp8_filter &= mask; + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; /* save bottom 3 bits so that we round one side +4 and the other +3 * if it equals 4 we'll set to adjust by -1 to account for the fact * we'd round 3 the other way */ - Filter1 = vp8_signed_char_clamp(vp8_filter + 4); - Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter2 = vp8_signed_char_clamp(filter_value + 3); Filter1 >>= 3; Filter2 >>= 3; u = vp8_signed_char_clamp(qs0 - Filter1); *oq0 = u ^ 0x80; u = vp8_signed_char_clamp(ps0 + Filter2); *op0 = u ^ 0x80; - vp8_filter = Filter1; + filter_value = Filter1; /* outer tap adjustments */ - vp8_filter += 1; - vp8_filter >>= 1; - vp8_filter &= ~hev; + filter_value += 1; + filter_value >>= 1; + filter_value &= ~hev; - u = vp8_signed_char_clamp(qs1 - vp8_filter); + u = vp8_signed_char_clamp(qs1 - filter_value); *oq1 = u ^ 0x80; - u = vp8_signed_char_clamp(ps1 + vp8_filter); + u = vp8_signed_char_clamp(ps1 + filter_value); *op1 = u ^ 0x80; } @@ -162,7 +162,7 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) { signed char s, u; - signed char vp8_filter, Filter1, Filter2; + signed char filter_value, Filter1, Filter2; signed char ps2 = (signed char) * op2 ^ 0x80; signed char ps1 = (signed char) * op1 ^ 0x80; signed char ps0 = (signed char) * op0 ^ 0x80; @@ -171,11 +171,11 @@ static void vp8_mbfilter(signed char mask, uc hev, signed char qs2 = (signed char) * oq2 ^ 0x80; /* add outer taps if we have high edge variance */ - vp8_filter = vp8_signed_char_clamp(ps1 - qs1); - vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); - vp8_filter &= mask; + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; - Filter2 = vp8_filter; + Filter2 = filter_value; Filter2 &= hev; /* save bottom 3 bits so that we round one side +4 and the other +3 */ @@ -188,8 +188,8 @@ static void vp8_mbfilter(signed char mask, uc hev, /* only apply wider filter if not high edge variance */ - vp8_filter &= ~hev; - Filter2 = vp8_filter; + filter_value &= ~hev; + Filter2 = filter_value; /* roughly 3/7th difference across boundary */ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); @@ -291,24 +291,24 @@ static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1) { - signed char vp8_filter, Filter1, Filter2; + signed char filter_value, Filter1, Filter2; signed char p1 = (signed char) * op1 ^ 0x80; signed char p0 = (signed char) * op0 ^ 0x80; signed char q0 = (signed char) * oq0 ^ 0x80; signed char q1 = (signed char) * oq1 ^ 0x80; signed char u; - vp8_filter = vp8_signed_char_clamp(p1 - q1); - vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0)); - vp8_filter &= mask; + filter_value = vp8_signed_char_clamp(p1 - q1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0)); + filter_value &= mask; /* save bottom 3 bits so that we round one side +4 and the other +3 */ - Filter1 = vp8_signed_char_clamp(vp8_filter + 4); + Filter1 = vp8_signed_char_clamp(filter_value + 4); Filter1 >>= 3; u = vp8_signed_char_clamp(q0 - Filter1); *oq0 = u ^ 0x80; - Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter2 = vp8_signed_char_clamp(filter_value + 3); Filter2 >>= 3; u = vp8_signed_char_clamp(p0 + Filter2); *op0 = u ^ 0x80; diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index a85121579..ec51ffe40 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -36,7 +36,6 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x, case DC_PRED: { int expected_dc; - int i; int shift; int average = 0; @@ -168,7 +167,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x, { int expected_udc; int expected_vdc; - int i; int shift; int Uaverage = 0; int Vaverage = 0; @@ -217,8 +215,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x, break; case V_PRED: { - int i; - for (i = 0; i < 8; i++) { vpx_memcpy(upred_ptr, uabove_row, 8); @@ -231,8 +227,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x, break; case H_PRED: { - int i; - for (i = 0; i < 8; i++) { vpx_memset(upred_ptr, uleft_col[i], 8); @@ -245,8 +239,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x, break; case TM_PRED: { - int i; - for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index 4eb96b743..ee892ded2 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -444,8 +444,9 @@ vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6 # Quantizer # prototype void vp8_regular_quantize_b "struct block *, struct blockd *" -specialize vp8_regular_quantize_b sse2 sse4_1 -vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4 +specialize vp8_regular_quantize_b sse2 #sse4_1 +# TODO(johann) Update sse4 implementation and re-enable +#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4 prototype void vp8_fast_quantize_b "struct block *, struct blockd *" specialize vp8_fast_quantize_b sse2 ssse3 media neon diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/vp8_asm_com_offsets.c index 7bab90f82..7bab90f82 100644 --- a/vp8/common/asm_com_offsets.c +++ b/vp8/common/vp8_asm_com_offsets.c diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm index 966c586e4..5cf110b53 100644 --- a/vp8/common/x86/postproc_mmx.asm +++ b/vp8/common/x86/postproc_mmx.asm @@ -61,7 +61,7 @@ sym(vp8_mbpost_proc_down_mmx): mov rcx, 8 .init_borderd ; initialize borders lea rdi, [rdi + rax] - movq [rdi], xmm1 + movq [rdi], mm1 dec rcx jne .init_borderd @@ -193,7 +193,6 @@ sym(vp8_mbpost_proc_down_mmx): movq mm4, [sym(vp8_rv) + rcx*2] %endif paddw mm1, mm4 - ;paddw xmm1, eight8s psraw mm1, 4 packuswb mm1, mm0 diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c index 7e7b05aa6..aa7a56a02 100644 --- a/vp8/decoder/dboolhuff.c +++ b/vp8/decoder/dboolhuff.c @@ -10,18 +10,20 @@ #include "dboolhuff.h" -#include "vpx_ports/mem.h" -#include "vpx_mem/vpx_mem.h" int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, - unsigned int source_sz) + unsigned int source_sz, + const unsigned char *origin, + const unsigned char *key) { br->user_buffer_end = source+source_sz; br->user_buffer = source; br->value = 0; br->count = -8; br->range = 255; + br->origin = origin; + br->key = key; if (source_sz && !source) return 1; @@ -32,19 +34,34 @@ int vp8dx_start_decode(BOOL_DECODER *br, return 0; } - void vp8dx_bool_decoder_fill(BOOL_DECODER *br) { - const unsigned char *bufptr; - const unsigned char *bufend; - VP8_BD_VALUE value; - int count; - bufend = br->user_buffer_end; - bufptr = br->user_buffer; - value = br->value; - count = br->count; - - VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend); + const unsigned char *bufptr = br->user_buffer; + const unsigned char *bufend = br->user_buffer_end; + VP8_BD_VALUE value = br->value; + int count = br->count; + int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8); + size_t bits_left = (bufend - bufptr)*CHAR_BIT; + int x = (int)(shift + CHAR_BIT - bits_left); + int loop_end = 0; + + if(x >= 0) + { + count += VP8_LOTS_OF_BITS; + loop_end = x; + } + + if (x < 0 || bits_left) + { + while(shift >= loop_end) + { + count += CHAR_BIT; + value |= ((VP8_BD_VALUE)decrypt_byte(bufptr, br->origin, + br->key)) << shift; + ++bufptr; + shift -= CHAR_BIT; + } + } br->user_buffer = bufptr; br->value = value; diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index 1a08c057b..46a4dd60e 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -9,21 +9,36 @@ */ -#ifndef DBOOLHUFF_H -#define DBOOLHUFF_H +#ifndef DBOOLHUFF_H_ +#define DBOOLHUFF_H_ + #include <stddef.h> #include <limits.h> + #include "vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" typedef size_t VP8_BD_VALUE; -# define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT) +#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT) + /*This is meant to be a large, positive constant that can still be efficiently loaded as an immediate (on platforms like ARM, for example). Even relatively modest values like 100 would work fine.*/ -# define VP8_LOTS_OF_BITS (0x40000000) +#define VP8_LOTS_OF_BITS (0x40000000) + +static unsigned char decrypt_byte(const unsigned char *ch, + const unsigned char *origin, + const unsigned char *key) +{ +#if CONFIG_DECRYPT + const int offset = (int)(ch - origin); + return *ch ^ key[offset % 32]; // VP8_DECRYPT_KEY_SIZE +#else + return *ch; +#endif +} typedef struct { @@ -32,46 +47,20 @@ typedef struct VP8_BD_VALUE value; int count; unsigned int range; + const unsigned char *origin; + const unsigned char *key; } BOOL_DECODER; DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, - unsigned int source_sz); + unsigned int source_sz, + const unsigned char *origin, + const unsigned char *key); void vp8dx_bool_decoder_fill(BOOL_DECODER *br); -/*The refill loop is used in several places, so define it in a macro to make - sure they're all consistent. - An inline function would be cleaner, but has a significant penalty, because - multiple BOOL_DECODER fields must be modified, and the compiler is not smart - enough to eliminate the stores to those fields and the subsequent reloads - from them when inlining the function.*/ -#define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \ - do \ - { \ - int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \ - int loop_end, x; \ - size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \ - \ - x = (int)(shift + CHAR_BIT - bits_left); \ - loop_end = 0; \ - if(x >= 0) \ - { \ - (_count) += VP8_LOTS_OF_BITS; \ - loop_end = x; \ - if(!bits_left) break; \ - } \ - while(shift >= loop_end) \ - { \ - (_count) += CHAR_BIT; \ - (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \ - shift -= CHAR_BIT; \ - } \ - } \ - while(0) \ - static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { unsigned int bit = 0; @@ -151,4 +140,5 @@ static int vp8dx_bool_error(BOOL_DECODER *br) /* No error. */ return 0; } -#endif + +#endif // DBOOLHUFF_H_ diff --git a/vp8/decoder/decodemv.h b/vp8/decoder/decodemv.h index 940342447..05a33d27f 100644 --- a/vp8/decoder/decodemv.h +++ b/vp8/decoder/decodemv.h @@ -8,7 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef DECODEMV_H_ +#define DECODEMV_H_ #include "onyxd_int.h" void vp8_decode_mode_mvs(VP8D_COMP *); + +#endif // DECODEMV_H_ diff --git a/vp8/decoder/decoderthreading.h b/vp8/decoder/decoderthreading.h index 60c39d1e1..bc716e489 100644 --- a/vp8/decoder/decoderthreading.h +++ b/vp8/decoder/decoderthreading.h @@ -8,19 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ - - - - -#ifndef _DECODER_THREADING_H -#define _DECODER_THREADING_H +#ifndef DECODERTHREADING_H_ +#define DECODERTHREADING_H_ #if CONFIG_MULTITHREAD -extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); -extern void vp8_decoder_remove_threads(VP8D_COMP *pbi); -extern void vp8_decoder_create_threads(VP8D_COMP *pbi); -extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); -extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); +void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); +void vp8_decoder_remove_threads(VP8D_COMP *pbi); +void vp8_decoder_create_threads(VP8D_COMP *pbi); +void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); +void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); #endif -#endif +#endif // DECODERTHREADING_H_ diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 6f8282a64..7060005a9 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -893,7 +893,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, { if (vp8dx_start_decode(bool_decoder, pbi->fragments.ptrs[partition_idx], - pbi->fragments.sizes[partition_idx])) + pbi->fragments.sizes[partition_idx], + pbi->fragments.ptrs[0], + pbi->decrypt_key)) vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", partition_idx); @@ -980,10 +982,11 @@ static void init_frame(VP8D_COMP *pbi) int vp8_decode_frame(VP8D_COMP *pbi) { - vp8_reader *const bc = & pbi->mbc[8]; - VP8_COMMON *const pc = & pbi->common; - MACROBLOCKD *const xd = & pbi->mb; + vp8_reader *const bc = &pbi->mbc[8]; + VP8_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; const unsigned char *data = pbi->fragments.ptrs[0]; + const unsigned char *const origin = data; const unsigned char *data_end = data + pbi->fragments.sizes[0]; ptrdiff_t first_partition_length_in_bytes; @@ -1016,13 +1019,21 @@ int vp8_decode_frame(VP8D_COMP *pbi) } else { - pc->frame_type = (FRAME_TYPE)(data[0] & 1); - pc->version = (data[0] >> 1) & 7; - pc->show_frame = (data[0] >> 4) & 1; + const unsigned char data0 = decrypt_byte(data + 0, origin, + pbi->decrypt_key); + const unsigned char data1 = decrypt_byte(data + 1, origin, + pbi->decrypt_key); + const unsigned char data2 = decrypt_byte(data + 2, origin, + pbi->decrypt_key); + + pc->frame_type = (FRAME_TYPE)(data0 & 1); + pc->version = (data0 >> 1) & 7; + pc->show_frame = (data0 >> 4) & 1; first_partition_length_in_bytes = - (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5; + (data0 | (data1 << 8) | (data2 << 16)) >> 5; - if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end + if (!pbi->ec_active && + (data + first_partition_length_in_bytes > data_end || data + first_partition_length_in_bytes < data)) vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt partition 0 length"); @@ -1040,7 +1051,13 @@ int vp8_decode_frame(VP8D_COMP *pbi) */ if (!pbi->ec_active || data + 3 < data_end) { - if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a) + const unsigned char data0 = decrypt_byte(data + 0, origin, + pbi->decrypt_key); + const unsigned char data1 = decrypt_byte(data + 1, origin, + pbi->decrypt_key); + const unsigned char data2 = decrypt_byte(data + 2, origin, + pbi->decrypt_key); + if (data0 != 0x9d || data1 != 0x01 || data2 != 0x2a) vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); } @@ -1051,10 +1068,19 @@ int vp8_decode_frame(VP8D_COMP *pbi) */ if (!pbi->ec_active || data + 6 < data_end) { - pc->Width = (data[3] | (data[4] << 8)) & 0x3fff; - pc->horiz_scale = data[4] >> 6; - pc->Height = (data[5] | (data[6] << 8)) & 0x3fff; - pc->vert_scale = data[6] >> 6; + const unsigned char data3 = decrypt_byte(data + 3, origin, + pbi->decrypt_key); + const unsigned char data4 = decrypt_byte(data + 4, origin, + pbi->decrypt_key); + const unsigned char data5 = decrypt_byte(data + 5, origin, + pbi->decrypt_key); + const unsigned char data6 = decrypt_byte(data + 6, origin, + pbi->decrypt_key); + + pc->Width = (data3 | (data4 << 8)) & 0x3fff; + pc->horiz_scale = data4 >> 6; + pc->Height = (data5 | (data6 << 8)) & 0x3fff; + pc->vert_scale = data6 >> 6; } data += 7; @@ -1072,7 +1098,11 @@ int vp8_decode_frame(VP8D_COMP *pbi) init_frame(pbi); - if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data))) + if (vp8dx_start_decode(bc, + data, + (unsigned int)(data_end - data), + pbi->fragments.ptrs[0], + pbi->decrypt_key)) vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder 0"); if (pc->frame_type == KEY_FRAME) { diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h index 8640bda4c..f2130b361 100644 --- a/vp8/decoder/detokenize.h +++ b/vp8/decoder/detokenize.h @@ -8,13 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#ifndef DETOKENIZE_H -#define DETOKENIZE_H +#ifndef DETOKENIZE_H_ +#define DETOKENIZE_H_ #include "onyxd_int.h" void vp8_reset_mb_tokens_context(MACROBLOCKD *x); int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); -#endif /* DETOKENIZE_H */ +#endif // DETOKENIZE_H diff --git a/vp8/decoder/ec_types.h b/vp8/decoder/ec_types.h index ccb5ddbb9..b24bfd943 100644 --- a/vp8/decoder/ec_types.h +++ b/vp8/decoder/ec_types.h @@ -14,7 +14,6 @@ #define MAX_OVERLAPS 16 - /* The area (pixel area in Q6) the block pointed to by bmi overlaps * another block with. */ @@ -48,4 +47,4 @@ typedef struct MV_REFERENCE_FRAME ref_frame; } EC_BLOCK; -#endif /* VP8_DEC_EC_TYPES_H */ +#endif // VP8_DEC_EC_TYPES_H diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c index 8b2e32be6..0b58c98fd 100644 --- a/vp8/decoder/error_concealment.c +++ b/vp8/decoder/error_concealment.c @@ -8,14 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "error_concealment.h" #include "onyxd_int.h" #include "decodemv.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/findnearmv.h" -#include <assert.h> - #define MIN(x,y) (((x)<(y))?(x):(y)) #define MAX(x,y) (((x)>(y))?(x):(y)) diff --git a/vp8/decoder/error_concealment.h b/vp8/decoder/error_concealment.h index 65ae9d9be..fb96b3605 100644 --- a/vp8/decoder/error_concealment.h +++ b/vp8/decoder/error_concealment.h @@ -9,8 +9,8 @@ */ -#ifndef ERROR_CONCEALMENT_H -#define ERROR_CONCEALMENT_H +#ifndef ERROR_CONCEALMENT_H_ +#define ERROR_CONCEALMENT_H_ #include "onyxd_int.h" #include "ec_types.h" @@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, */ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd); -#endif +#endif // ERROR_CONCEALMENT_H_ diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index fb2dde852..c2325ebef 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -9,8 +9,9 @@ */ -#ifndef __INC_VP8D_INT_H -#define __INC_VP8D_INT_H +#ifndef ONYXD_INT_H_ +#define ONYXD_INT_H_ + #include "vpx_config.h" #include "vp8/common/onyxd.h" #include "treereader.h" @@ -121,6 +122,7 @@ typedef struct VP8D_COMP int independent_partitions; int frame_corrupt_residual; + const unsigned char *decrypt_key; } VP8D_COMP; int vp8_decode_frame(VP8D_COMP *cpi); @@ -145,4 +147,4 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); } while(0) #endif -#endif +#endif // ONYXD_INT_H_ diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index b18cb5065..73f9a8356 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -36,7 +36,7 @@ } while (0) -extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count) { diff --git a/vp8/decoder/treereader.h b/vp8/decoder/treereader.h index 238ff8536..9393bb478 100644 --- a/vp8/decoder/treereader.h +++ b/vp8/decoder/treereader.h @@ -9,18 +9,17 @@ */ -#ifndef tree_reader_h -#define tree_reader_h 1 +#ifndef TREEREADER_H_ +#define TREEREADER_H_ #include "vp8/common/treecoder.h" - #include "dboolhuff.h" typedef BOOL_DECODER vp8_reader; #define vp8_read vp8dx_decode_bool #define vp8_read_literal vp8_decode_value -#define vp8_read_bit( R) vp8_read( R, vp8_prob_half) +#define vp8_read_bit(R) vp8_read(R, vp8_prob_half) /* Intent of tree data structure is to make decoding trivial. */ @@ -38,4 +37,4 @@ static int vp8_treed_read( return -i; } -#endif /* tree_reader_h */ +#endif // TREEREADER_H_ diff --git a/vp8/decoder/asm_dec_offsets.c b/vp8/decoder/vp8_asm_dec_offsets.c index 842a0d574..842a0d574 100644 --- a/vp8/decoder/asm_dec_offsets.c +++ b/vp8/decoder/vp8_asm_dec_offsets.c diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 383196904..ca680f9a5 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -980,6 +980,12 @@ void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_garf ) { + assert(prob_intra >= 0); + assert(prob_intra <= 255); + assert(prob_last >= 0); + assert(prob_last <= 255); + assert(prob_garf >= 0); + assert(prob_garf <= 255); ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra); ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra) + vp8_cost_zero(prob_last); diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index a30f88816..cf74c7aaf 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -37,7 +37,7 @@ typedef struct block /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */ short *quant; short *quant_fast; - unsigned char *quant_shift; + short *quant_shift; short *zbin; short *zrun_zbin_boost; short *round; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 124b1cb35..916137b49 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -641,7 +641,6 @@ void vp8_set_speed_features(VP8_COMP *cpi) for (i = 0; i < MAX_MODES; i ++) { cpi->mode_check_freq[i] = 0; - cpi->mode_chosen_counts[i] = 0; } cpi->mb.mbs_tested_so_far = 0; @@ -2816,6 +2815,8 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) if (cpi->common.refresh_alt_ref_frame) { cpi->prob_intra_coded += 40; + if (cpi->prob_intra_coded > 255) + cpi->prob_intra_coded = 255; cpi->prob_last_coded = 200; cpi->prob_gf_coded = 1; } @@ -4598,9 +4599,6 @@ static void encode_frame_to_data_rate cm->frame_type, cm->refresh_golden_frame, cm->refresh_alt_ref_frame); - for (i = 0; i < MAX_MODES; i++) - fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); - fprintf(fmodes, "\n"); fclose(fmodes); diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 378731d0a..c79531c5d 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -282,17 +282,17 @@ typedef struct VP8_COMP { DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]); @@ -349,7 +349,6 @@ typedef struct VP8_COMP int ambient_err; unsigned int mode_check_freq[MAX_MODES]; - unsigned int mode_chosen_counts[MAX_MODES]; int rd_baseline_thresh[MAX_MODES]; diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index 33c8ef055..4e2fef793 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -50,8 +50,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) if (x >= zbin) { x += round_ptr[rc]; - y = (((x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; /* quantize (x) */ + y = ((((x * quant_ptr[rc]) >> 16) + x) + * quant_shift_ptr[rc]) >> 16; /* quantize (x) */ x = (y ^ sz) - sz; /* get the sign back */ qcoeff_ptr[rc] = x; /* write to destination */ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */ @@ -113,7 +113,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant; - unsigned char *quant_shift_ptr = b->quant_shift; + short *quant_shift_ptr = b->quant_shift; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -138,8 +138,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d) if (x >= zbin) { x += round_ptr[rc]; - y = (((x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; /* quantize (x) */ + y = ((((x * quant_ptr[rc]) >> 16) + x) + * quant_shift_ptr[rc]) >> 16; /* quantize (x) */ x = (y ^ sz) - sz; /* get the sign back */ qcoeff_ptr[rc] = x; /* write to destination */ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */ @@ -167,7 +167,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d) int sz; short *coeff_ptr; short *quant_ptr; - unsigned char *quant_shift_ptr; + short *quant_shift_ptr; short *qcoeff_ptr; short *dqcoeff_ptr; short *dequant_ptr; @@ -198,7 +198,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d) if (x >= dq) { /* Quantize x. */ - y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc]; + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; /* Put the sign back. */ x = (y + sz) ^ sz; /* Save the coefficient and its dequantized value. */ @@ -406,7 +406,7 @@ static const int qzbin_factors_y2[129] = #define EXACT_QUANT #ifdef EXACT_QUANT static void invert_quant(int improved_quant, short *quant, - unsigned char *shift, short d) + short *shift, short d) { if(improved_quant) { @@ -418,11 +418,15 @@ static void invert_quant(int improved_quant, short *quant, t = 1 + (1<<(16+l))/d; *quant = (short)(t - (1<<16)); *shift = l; + /* use multiplication and constant shift by 16 */ + *shift = 1 << (16 - *shift); } else { *quant = (1 << 16) / d; *shift = 0; + /* use multiplication and constant shift by 16 */ + *shift = 1 << (16 - *shift); } } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 3d60bebda..9080c627c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -2512,9 +2512,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, x->rd_thresh_mult[best_mode_index]; } - /* Note how often each mode chosen as best */ - cpi->mode_chosen_counts[best_mode_index] ++; - #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/vp8_asm_enc_offsets.c index a4169b32f..a4169b32f 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/vp8_asm_enc_offsets.c diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm deleted file mode 100644 index b41768ce0..000000000 --- a/vp8/encoder/x86/quantize_sse2.asm +++ /dev/null @@ -1,245 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -%include "vp8_asm_enc_offsets.asm" - - -; void vp8_regular_quantize_b_sse2 | arg -; (BLOCK *b, | 0 -; BLOCKD *d) | 1 - -global sym(vp8_regular_quantize_b_sse2) PRIVATE -sym(vp8_regular_quantize_b_sse2): - push rbp - mov rbp, rsp - SAVE_XMM 7 - GET_GOT rbx - -%if ABI_IS_32BIT - push rdi - push rsi -%else - %if LIBVPX_YASM_WIN64 - push rdi - push rsi - %endif -%endif - - ALIGN_STACK 16, rax - %define zrun_zbin_boost 0 ; 8 - %define abs_minus_zbin 8 ; 32 - %define temp_qcoeff 40 ; 32 - %define qcoeff 72 ; 32 - %define stack_size 104 - sub rsp, stack_size - ; end prolog - -%if ABI_IS_32BIT - mov rdi, arg(0) ; BLOCK *b - mov rsi, arg(1) ; BLOCKD *d -%else - %if LIBVPX_YASM_WIN64 - mov rdi, rcx ; BLOCK *b - mov rsi, rdx ; BLOCKD *d - %else - ;mov rdi, rdi ; BLOCK *b - ;mov rsi, rsi ; BLOCKD *d - %endif -%endif - - mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr - mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr - movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value - - ; z - movdqa xmm0, [rdx] - movdqa xmm4, [rdx + 16] - mov rdx, [rdi + vp8_block_round] ; round_ptr - - pshuflw xmm7, xmm7, 0 - punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value - - movdqa xmm1, xmm0 - movdqa xmm5, xmm4 - - ; sz - psraw xmm0, 15 - psraw xmm4, 15 - - ; (z ^ sz) - pxor xmm1, xmm0 - pxor xmm5, xmm4 - - ; x = abs(z) - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - movdqa xmm2, [rcx] - movdqa xmm3, [rcx + 16] - mov rcx, [rdi + vp8_block_quant] ; quant_ptr - - ; *zbin_ptr + zbin_oq_value - paddw xmm2, xmm7 - paddw xmm3, xmm7 - - ; x - (*zbin_ptr + zbin_oq_value) - psubw xmm1, xmm2 - psubw xmm5, xmm3 - movdqa [rsp + abs_minus_zbin], xmm1 - movdqa [rsp + abs_minus_zbin + 16], xmm5 - - ; add (zbin_ptr + zbin_oq_value) back - paddw xmm1, xmm2 - paddw xmm5, xmm3 - - movdqa xmm2, [rdx] - movdqa xmm6, [rdx + 16] - - movdqa xmm3, [rcx] - movdqa xmm7, [rcx + 16] - - ; x + round - paddw xmm1, xmm2 - paddw xmm5, xmm6 - - ; y = x * quant_ptr >> 16 - pmulhw xmm3, xmm1 - pmulhw xmm7, xmm5 - - ; y += x - paddw xmm1, xmm3 - paddw xmm5, xmm7 - - movdqa [rsp + temp_qcoeff], xmm1 - movdqa [rsp + temp_qcoeff + 16], xmm5 - - pxor xmm6, xmm6 - ; zero qcoeff - movdqa [rsp + qcoeff], xmm6 - movdqa [rsp + qcoeff + 16], xmm6 - - mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr - mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr - mov [rsp + zrun_zbin_boost], rdx - -%macro ZIGZAG_LOOP 1 - ; x - movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] - - ; if (x >= zbin) - sub cx, WORD PTR[rdx] ; x - zbin - lea rdx, [rdx + 2] ; zbin_boost_ptr++ - jl .rq_zigzag_loop_%1 ; x < zbin - - movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] - - ; downshift by quant_shift[rc] - movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] - sar edi, cl ; also sets Z bit - je .rq_zigzag_loop_%1 ; !y - mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] - mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost -.rq_zigzag_loop_%1: -%endmacro -; in vp8_default_zig_zag1d order: see vp8/common/entropy.c -ZIGZAG_LOOP 0 -ZIGZAG_LOOP 1 -ZIGZAG_LOOP 4 -ZIGZAG_LOOP 8 -ZIGZAG_LOOP 5 -ZIGZAG_LOOP 2 -ZIGZAG_LOOP 3 -ZIGZAG_LOOP 6 -ZIGZAG_LOOP 9 -ZIGZAG_LOOP 12 -ZIGZAG_LOOP 13 -ZIGZAG_LOOP 10 -ZIGZAG_LOOP 7 -ZIGZAG_LOOP 11 -ZIGZAG_LOOP 14 -ZIGZAG_LOOP 15 - - movdqa xmm2, [rsp + qcoeff] - movdqa xmm3, [rsp + qcoeff + 16] - - mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr - mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr - - ; y ^ sz - pxor xmm2, xmm0 - pxor xmm3, xmm4 - ; x = (y ^ sz) - sz - psubw xmm2, xmm0 - psubw xmm3, xmm4 - - ; dequant - movdqa xmm0, [rcx] - movdqa xmm1, [rcx + 16] - - mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr - - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - movdqa [rcx], xmm2 ; store qcoeff - movdqa [rcx + 16], xmm3 - movdqa [rdi], xmm0 ; store dqcoeff - movdqa [rdi + 16], xmm1 - - mov rcx, [rsi + vp8_blockd_eob] - - ; select the last value (in zig_zag order) for EOB - pcmpeqw xmm2, xmm6 - pcmpeqw xmm3, xmm6 - ; ! - pcmpeqw xmm6, xmm6 - pxor xmm2, xmm6 - pxor xmm3, xmm6 - ; mask inv_zig_zag - pand xmm2, [GLOBAL(inv_zig_zag)] - pand xmm3, [GLOBAL(inv_zig_zag + 16)] - ; select the max value - pmaxsw xmm2, xmm3 - pshufd xmm3, xmm2, 00001110b - pmaxsw xmm2, xmm3 - pshuflw xmm3, xmm2, 00001110b - pmaxsw xmm2, xmm3 - pshuflw xmm3, xmm2, 00000001b - pmaxsw xmm2, xmm3 - movd eax, xmm2 - and eax, 0xff - - mov BYTE PTR [rcx], al ; store eob - - ; begin epilog - add rsp, stack_size - pop rsp -%if ABI_IS_32BIT - pop rsi - pop rdi -%else - %if LIBVPX_YASM_WIN64 - pop rsi - pop rdi - %endif -%endif - RESTORE_GOT - RESTORE_XMM - pop rbp - ret - -SECTION_RODATA -align 16 -inv_zig_zag: - dw 0x0001, 0x0002, 0x0006, 0x0007 - dw 0x0003, 0x0005, 0x0008, 0x000d - dw 0x0004, 0x0009, 0x000c, 0x000e - dw 0x000a, 0x000b, 0x000f, 0x0010 diff --git a/vp8/encoder/x86/quantize_sse2_intrinsics.c b/vp8/encoder/x86/quantize_sse2_intrinsics.c index 55d57ad62..f495bf287 100644 --- a/vp8/encoder/x86/quantize_sse2_intrinsics.c +++ b/vp8/encoder/x86/quantize_sse2_intrinsics.c @@ -9,13 +9,139 @@ */ -#include "vp8/common/blockd.h" -#include "vp8/common/entropy.h" +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vpx_mem/vpx_mem.h" #include "vp8/encoder/block.h" - -#include <mmintrin.h> //MMX -#include <xmmintrin.h> //SSE -#include <emmintrin.h> //SSE2 +#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ + +#include <mmintrin.h> /* MMX */ +#include <xmmintrin.h> /* SSE */ +#include <emmintrin.h> /* SSE2 */ + +#define SELECT_EOB(i, z) \ + do { \ + short boost = *zbin_boost_ptr; \ + int cmp = (x[z] < boost) | (y[z] == 0); \ + zbin_boost_ptr++; \ + if (cmp) \ + goto select_eob_end_##i; \ + qcoeff_ptr[z] = y[z]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + select_eob_end_##i:; \ + } while (0) + +void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) +{ + char eob = 0; + short *zbin_boost_ptr = b->zrun_zbin_boost; + short *qcoeff_ptr = d->qcoeff; + DECLARE_ALIGNED_ARRAY(16, short, x, 16); + DECLARE_ALIGNED_ARRAY(16, short, y, 16); + + __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + vpx_memset(qcoeff_ptr, 0, 32); + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* Sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + _mm_store_si128((__m128i *)(x), x_minus_zbin0); + _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. */ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Return the sign: (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + y0 = _mm_sub_epi16(y0, sz0); + y1 = _mm_sub_epi16(y1, sz1); + + _mm_store_si128((__m128i *)(y), y0); + _mm_store_si128((__m128i *)(y + 8), y1); + + zbin_boost_ptr = b->zrun_zbin_boost; + + /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ + SELECT_EOB(1, 0); + SELECT_EOB(2, 1); + SELECT_EOB(3, 4); + SELECT_EOB(4, 8); + SELECT_EOB(5, 5); + SELECT_EOB(6, 2); + SELECT_EOB(7, 3); + SELECT_EOB(8, 6); + SELECT_EOB(9, 9); + SELECT_EOB(10, 12); + SELECT_EOB(11, 13); + SELECT_EOB(12, 10); + SELECT_EOB(13, 7); + SELECT_EOB(14, 11); + SELECT_EOB(15, 14); + SELECT_EOB(16, 15); + + y0 = _mm_load_si128((__m128i *)(d->qcoeff)); + y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); + + /* dqcoeff = qcoeff * dequant */ + y0 = _mm_mullo_epi16(y0, dequant0); + y1 = _mm_mullo_epi16(y1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), y0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); + + *d->eob = eob; +} void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) { diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 2f73420a5..cde2651b5 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -14,7 +14,6 @@ VP8_COMMON_SRCS-yes += common/ppflags.h VP8_COMMON_SRCS-yes += common/onyx.h VP8_COMMON_SRCS-yes += common/onyxd.h VP8_COMMON_SRCS-yes += common/alloccommon.c -VP8_COMMON_SRCS-yes += common/asm_com_offsets.c VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h VP8_COMMON_SRCS-yes += common/debugmodes.c @@ -67,6 +66,7 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c VP8_COMMON_SRCS-yes += common/variance_c.c VP8_COMMON_SRCS-yes += common/variance.h +VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h @@ -193,6 +193,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) $(eval $(call asm_offsets_template,\ - vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/asm_com_offsets.c)) + vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c)) $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh)) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 0b58b0eaa..b985cb1b7 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1238,7 +1238,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {1, 30}, /* g_timebase */ 0, /* g_error_resilient */ - 0, /* g_frame_parallel_decoding */ VPX_RC_ONE_PASS, /* g_pass */ diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 1db61f161..f3834b063 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -29,6 +29,8 @@ #define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \ VPX_CODEC_CAP_ERROR_CONCEALMENT : 0) +#define VP8_DECRYPT_KEY_SIZE 32 + typedef vpx_codec_stream_info_t vp8_stream_info_t; /* Structures for handling memory allocations */ @@ -73,6 +75,7 @@ struct vpx_codec_alg_priv int dbg_color_b_modes_flag; int dbg_display_mv_flag; #endif + unsigned char decrypt_key[VP8_DECRYPT_KEY_SIZE]; vpx_image_t img; int img_setup; struct frame_buffers yv12_frame_buffers; @@ -150,6 +153,8 @@ static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si, return res; } +static const unsigned char fake_decrypt_key[VP8_DECRYPT_KEY_SIZE] = { 0 }; + static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { int i; @@ -164,6 +169,8 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) ctx->priv->alg_priv->mmaps[0] = *mmap; ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); + memcpy(ctx->priv->alg_priv->decrypt_key, fake_decrypt_key, + VP8_DECRYPT_KEY_SIZE); ctx->priv->init_flags = ctx->init_flags; if (ctx->config.dec) @@ -211,21 +218,19 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, mmap.flags = vp8_mem_req_segs[0].flags; res = vp8_mmap_alloc(&mmap); + if (res != VPX_CODEC_OK) return res; - if (!res) - { - vp8_init_ctx(ctx, &mmap); + vp8_init_ctx(ctx, &mmap); - /* initialize number of fragments to zero */ - ctx->priv->alg_priv->fragments.count = 0; - /* is input fragments enabled? */ - ctx->priv->alg_priv->fragments.enabled = - (ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_INPUT_FRAGMENTS); + /* initialize number of fragments to zero */ + ctx->priv->alg_priv->fragments.count = 0; + /* is input fragments enabled? */ + ctx->priv->alg_priv->fragments.enabled = + (ctx->priv->alg_priv->base.init_flags & + VPX_CODEC_USE_INPUT_FRAGMENTS); - ctx->priv->alg_priv->defer_alloc = 1; - /*post processing level initialized to do nothing */ - } + ctx->priv->alg_priv->defer_alloc = 1; + /*post processing level initialized to do nothing */ } ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = @@ -264,14 +269,17 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) return VPX_CODEC_OK; } -static vpx_codec_err_t vp8_peek_si(const uint8_t *data, - unsigned int data_sz, - vpx_codec_stream_info_t *si) +static vpx_codec_err_t vp8_peek_si_external(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si, + const unsigned char *decrypt_key) { vpx_codec_err_t res = VPX_CODEC_OK; if(data + data_sz <= data) + { res = VPX_CODEC_INVALID_PARAM; + } else { /* Parse uncompresssed part of key frame header. @@ -280,30 +288,45 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t *data, * 4 bytes:- including image width and height in the lowest 14 bits * of each 2-byte value. */ - si->is_kf = 0; - if (data_sz >= 10 && !(data[0] & 0x01)) /* I-Frame */ + const uint8_t data0 = decrypt_byte(data, data, decrypt_key); + si->is_kf = 0; + if (data_sz >= 10 && !(data0 & 0x01)) /* I-Frame */ { - const uint8_t *c = data + 3; + const uint8_t data3 = decrypt_byte(data + 3, data, decrypt_key); + const uint8_t data4 = decrypt_byte(data + 4, data, decrypt_key); + const uint8_t data5 = decrypt_byte(data + 5, data, decrypt_key); + const uint8_t data6 = decrypt_byte(data + 6, data, decrypt_key); + const uint8_t data7 = decrypt_byte(data + 7, data, decrypt_key); + const uint8_t data8 = decrypt_byte(data + 8, data, decrypt_key); + const uint8_t data9 = decrypt_byte(data + 9, data, decrypt_key); + si->is_kf = 1; /* vet via sync code */ - if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a) + if (data3 != 0x9d || data4 != 0x01 || data5 != 0x2a) res = VPX_CODEC_UNSUP_BITSTREAM; - si->w = (c[3] | (c[4] << 8)) & 0x3fff; - si->h = (c[5] | (c[6] << 8)) & 0x3fff; + si->w = (data6 | (data7 << 8)) & 0x3fff; + si->h = (data8 | (data9 << 8)) & 0x3fff; /*printf("w=%d, h=%d\n", si->w, si->h);*/ if (!(si->h | si->w)) res = VPX_CODEC_UNSUP_BITSTREAM; } else + { res = VPX_CODEC_UNSUP_BITSTREAM; + } } return res; +} +static vpx_codec_err_t vp8_peek_si(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return vp8_peek_si_external(data, data_sz, si, fake_decrypt_key); } static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx, @@ -432,8 +455,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, w = ctx->si.w; h = ctx->si.h; - res = ctx->base.iface->dec.peek_si(ctx->fragments.ptrs[0], - ctx->fragments.sizes[0], &ctx->si); + res = vp8_peek_si_external(ctx->fragments.ptrs[0], + ctx->fragments.sizes[0], + &ctx->si, + ctx->decrypt_key); if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf) { @@ -507,6 +532,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); + ctx->yv12_frame_buffers.pbi[0]->decrypt_key = ctx->decrypt_key; } ctx->decoder_init = 1; @@ -928,6 +954,20 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } + +static vpx_codec_err_t vp8_set_decrypt_key(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) +{ + const unsigned char *data = va_arg(args, const unsigned char *); + if (data == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + + memcpy(ctx->decrypt_key, data, VP8_DECRYPT_KEY_SIZE); + return VPX_CODEC_OK; +} + vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { {VP8_SET_REFERENCE, vp8_set_reference}, @@ -940,6 +980,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, {VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame}, + {VP8_SET_DECRYPT_KEY, vp8_set_decrypt_key}, { -1, NULL}, }; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 2a0e7c526..ca9f6a62e 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -24,7 +24,6 @@ VP8_CX_SRCS-yes += vp8cx.mk VP8_CX_SRCS-yes += vp8_cx_iface.c -VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h VP8_CX_SRCS-yes += encoder/bitstream.c VP8_CX_SRCS-yes += encoder/boolhuff.c @@ -78,6 +77,7 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h +VP8_CX_SRCS-yes += encoder/vp8_asm_enc_offsets.c ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c @@ -90,7 +90,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c -VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm # TODO(johann) make this generic ifeq ($(HAVE_SSE2),yes) @@ -122,4 +121,4 @@ endif VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) $(eval $(call asm_offsets_template,\ - vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/asm_enc_offsets.c)) + vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/vp8_asm_enc_offsets.c)) diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk index 8be4c7ba5..c26f42d58 100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@ -20,7 +20,6 @@ VP8_DX_SRCS-yes += vp8dx.mk VP8_DX_SRCS-yes += vp8_dx_iface.c -VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c VP8_DX_SRCS-yes += decoder/dboolhuff.c VP8_DX_SRCS-yes += decoder/decodemv.c VP8_DX_SRCS-yes += decoder/decodframe.c @@ -36,8 +35,9 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h VP8_DX_SRCS-yes += decoder/treereader.h VP8_DX_SRCS-yes += decoder/onyxd_if.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c +VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) $(eval $(call asm_offsets_template,\ - vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/asm_dec_offsets.c)) + vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c)) diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 8de68505a..8b6efc384 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -361,7 +361,7 @@ specialize vp9_short_idct1_16x16 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct32x32 +specialize vp9_short_idct32x32 sse2 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c index 859e211bd..44d317293 100644 --- a/vp9/common/vp9_seg_common.c +++ b/vp9/common/vp9_seg_common.c @@ -51,7 +51,7 @@ int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { } int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { - return (segfeaturedata_signed[feature_id]); + return segfeaturedata_signed[feature_id]; } void vp9_clear_segdata(MACROBLOCKD *xd, diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c index 1a2c84a40..811ed9899 100644 --- a/vp9/common/x86/vp9_idct_x86.c +++ b/vp9/common/x86/vp9_idct_x86.c @@ -298,129 +298,110 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ } -#define IDCT8x8_1D \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - tmp0 = _mm_madd_epi16(lo_17, stg1_0); \ - tmp1 = _mm_madd_epi16(hi_17, stg1_0); \ - tmp2 = _mm_madd_epi16(lo_17, stg1_1); \ - tmp3 = _mm_madd_epi16(hi_17, stg1_1); \ - tmp4 = _mm_madd_epi16(lo_35, stg1_2); \ - tmp5 = _mm_madd_epi16(hi_35, stg1_2); \ - tmp6 = _mm_madd_epi16(lo_35, stg1_3); \ - tmp7 = _mm_madd_epi16(hi_35, stg1_3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp1_4 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_7 = _mm_packs_epi32(tmp2, tmp3); \ - stp1_5 = _mm_packs_epi32(tmp4, tmp5); \ - stp1_6 = _mm_packs_epi32(tmp6, tmp7); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - tmp0 = _mm_madd_epi16(lo_04, stg2_0); \ - tmp1 = _mm_madd_epi16(hi_04, stg2_0); \ - tmp2 = _mm_madd_epi16(lo_04, stg2_1); \ - tmp3 = _mm_madd_epi16(hi_04, stg2_1); \ - tmp4 = _mm_madd_epi16(lo_26, stg2_2); \ - tmp5 = _mm_madd_epi16(hi_26, stg2_2); \ - tmp6 = _mm_madd_epi16(lo_26, stg2_3); \ - tmp7 = _mm_madd_epi16(hi_26, stg2_3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp2_0 = _mm_packs_epi32(tmp0, tmp1); \ - stp2_1 = _mm_packs_epi32(tmp2, tmp3); \ - stp2_2 = _mm_packs_epi32(tmp4, tmp5); \ - stp2_3 = _mm_packs_epi32(tmp6, tmp7); \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - in0 = _mm_adds_epi16(stp1_0, stp2_7); \ - in1 = _mm_adds_epi16(stp1_1, stp1_6); \ - in2 = _mm_adds_epi16(stp1_2, stp1_5); \ - in3 = _mm_adds_epi16(stp1_3, stp2_4); \ - in4 = _mm_subs_epi16(stp1_3, stp2_4); \ - in5 = _mm_subs_epi16(stp1_2, stp1_5); \ - in6 = _mm_subs_epi16(stp1_1, stp1_6); \ +// Define Macro for multiplying elements by constants and adding them together. +#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define IDCT8x8_1D \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + in0 = _mm_adds_epi16(stp1_0, stp2_7); \ + in1 = _mm_adds_epi16(stp1_1, stp1_6); \ + in2 = _mm_adds_epi16(stp1_2, stp1_5); \ + in3 = _mm_adds_epi16(stp1_3, stp2_4); \ + in4 = _mm_subs_epi16(stp1_3, stp2_4); \ + in5 = _mm_subs_epi16(stp1_2, stp1_5); \ + in6 = _mm_subs_epi16(stp1_1, stp1_6); \ in7 = _mm_subs_epi16(stp1_0, stp2_7); void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) { @@ -643,9 +624,9 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); } -#define IDCT16x16_1D \ - /* Stage2 */ \ - { \ +#define IDCT16x16_1D \ + /* Stage2 */ \ + { \ const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ @@ -654,250 +635,110 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ - \ - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); \ - tmp1 = _mm_madd_epi16(hi_1_15, stg2_0); \ - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); \ - tmp3 = _mm_madd_epi16(hi_1_15, stg2_1); \ - tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); \ - tmp5 = _mm_madd_epi16(hi_9_7, stg2_2); \ - tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); \ - tmp7 = _mm_madd_epi16(hi_9_7, stg2_3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp2_8 = _mm_packs_epi32(tmp0, tmp1); \ - stp2_15 = _mm_packs_epi32(tmp2, tmp3); \ - stp2_9 = _mm_packs_epi32(tmp4, tmp5); \ - stp2_14 = _mm_packs_epi32(tmp6, tmp7); \ - \ - tmp0 = _mm_madd_epi16(lo_5_11, stg2_4); \ - tmp1 = _mm_madd_epi16(hi_5_11, stg2_4); \ - tmp2 = _mm_madd_epi16(lo_5_11, stg2_5); \ - tmp3 = _mm_madd_epi16(hi_5_11, stg2_5); \ - tmp4 = _mm_madd_epi16(lo_13_3, stg2_6); \ - tmp5 = _mm_madd_epi16(hi_13_3, stg2_6); \ - tmp6 = _mm_madd_epi16(lo_13_3, stg2_7); \ - tmp7 = _mm_madd_epi16(hi_13_3, stg2_7); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp2_10 = _mm_packs_epi32(tmp0, tmp1); \ - stp2_13 = _mm_packs_epi32(tmp2, tmp3); \ - stp2_11 = _mm_packs_epi32(tmp4, tmp5); \ - stp2_12 = _mm_packs_epi32(tmp6, tmp7); \ - } \ - \ - /* Stage3 */ \ - { \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ - \ - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); \ - tmp1 = _mm_madd_epi16(hi_2_14, stg3_0); \ - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); \ - tmp3 = _mm_madd_epi16(hi_2_14, stg3_1); \ - tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); \ - tmp5 = _mm_madd_epi16(hi_10_6, stg3_2); \ - tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); \ - tmp7 = _mm_madd_epi16(hi_10_6, stg3_3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp1_4 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_7 = _mm_packs_epi32(tmp2, tmp3); \ - stp1_5 = _mm_packs_epi32(tmp4, tmp5); \ - stp1_6 = _mm_packs_epi32(tmp6, tmp7); \ - \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ + \ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ + } \ + \ + /* Stage4 */ \ + { \ const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ - \ + \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); \ - tmp1 = _mm_madd_epi16(hi_0_8, stg4_0); \ - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); \ - tmp3 = _mm_madd_epi16(hi_0_8, stg4_1); \ - tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); \ - tmp5 = _mm_madd_epi16(hi_4_12, stg4_2); \ - tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); \ - tmp7 = _mm_madd_epi16(hi_4_12, stg4_3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp2_0 = _mm_packs_epi32(tmp0, tmp1); \ - stp2_1 = _mm_packs_epi32(tmp2, tmp3); \ - stp2_2 = _mm_packs_epi32(tmp4, tmp5); \ - stp2_3 = _mm_packs_epi32(tmp6, tmp7); \ - \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); \ - tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); \ - tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); \ - tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); \ - tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); \ - tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); \ - tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); \ - tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp2_9 = _mm_packs_epi32(tmp0, tmp1); \ - stp2_14 = _mm_packs_epi32(tmp2, tmp3); \ - stp2_10 = _mm_packs_epi32(tmp4, tmp5); \ - stp2_13 = _mm_packs_epi32(tmp6, tmp7); \ - } \ - \ - /* Stage5 */ \ - { \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ + \ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ + \ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ + \ tmp0 = _mm_add_epi32(tmp0, rounding); \ tmp1 = _mm_add_epi32(tmp1, rounding); \ tmp2 = _mm_add_epi32(tmp2, rounding); \ tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ + \ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ + \ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ + \ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ + \ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ + } \ + \ + /* Stage6 */ \ + { \ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ + \ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ @@ -906,38 +747,10 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); \ - tmp1 = _mm_madd_epi16(hi_10_13, stg6_0); \ - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_10_13, stg4_0); \ - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); \ - tmp5 = _mm_madd_epi16(hi_11_12, stg6_0); \ - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); \ - tmp7 = _mm_madd_epi16(hi_11_12, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - stp2_10 = _mm_packs_epi32(tmp0, tmp1); \ - stp2_13 = _mm_packs_epi32(tmp2, tmp3); \ - stp2_11 = _mm_packs_epi32(tmp4, tmp5); \ - stp2_12 = _mm_packs_epi32(tmp6, tmp7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ } void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) { @@ -1506,4 +1319,657 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { output += 8; } } + +void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j; + + // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. + for (i = 0; i < 8; i++) { + if (i < 4) { + // First 1-D idct + // Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); + in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); + in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); + in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); + in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); + in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); + in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); + in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); + in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); + + input += 256; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, + in28, in29, in30, in31); + } + + // Stage1 + { + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); + + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); + + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); + + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); + + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, + stp1_17, stp1_30) + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, + stp1_23, stp1_24) + } + + // Stage2 + { + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); + + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); + + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, + stp2_14) + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, + stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); + + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); + + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); + } + + // Stage3 + { + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); + + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, + stp1_6) + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, + stp1_18, stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, + stp1_22, stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + // Stage4 + { + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); + + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, + stp2_2, stp2_3) + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, + stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + // Stage5 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, + stp1_21, stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // Stage6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, + stp2_13, stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + // Stage7 + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + stp1_18 = stp2_18; + stp1_19 = stp2_19; + + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, + stp1_23, stp1_24) + + stp1_28 = stp2_28; + stp1_29 = stp2_29; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // final stage + if (i < 4) { + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } else { + // 2_D: Calculate the results and store them to destination. + in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = _mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + _mm_store_si128((__m128i *)(output + half_pitch * 16), in16); + _mm_store_si128((__m128i *)(output + half_pitch * 17), in17); + _mm_store_si128((__m128i *)(output + half_pitch * 18), in18); + _mm_store_si128((__m128i *)(output + half_pitch * 19), in19); + _mm_store_si128((__m128i *)(output + half_pitch * 20), in20); + _mm_store_si128((__m128i *)(output + half_pitch * 21), in21); + _mm_store_si128((__m128i *)(output + half_pitch * 22), in22); + _mm_store_si128((__m128i *)(output + half_pitch * 23), in23); + _mm_store_si128((__m128i *)(output + half_pitch * 24), in24); + _mm_store_si128((__m128i *)(output + half_pitch * 25), in25); + _mm_store_si128((__m128i *)(output + half_pitch * 26), in26); + _mm_store_si128((__m128i *)(output + half_pitch * 27), in27); + _mm_store_si128((__m128i *)(output + half_pitch * 28), in28); + _mm_store_si128((__m128i *)(output + half_pitch * 29), in29); + _mm_store_si128((__m128i *)(output + half_pitch * 30), in30); + _mm_store_si128((__m128i *)(output + half_pitch * 31), in31); + + output += 8; + } + } +} #endif diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 9cb18143f..353e94fa5 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -490,7 +490,7 @@ static void read_switchable_interp_probs(VP9D_COMP* const pbi, int i, j; for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8); + cm->fc.switchable_interp_prob[j][i] = vp9_read_prob(bc); } } //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0], @@ -511,13 +511,13 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) { #if CONFIG_COMP_INTERINTRA_PRED if (cm->use_interintra) { if (vp9_read(bc, VP9_UPD_INTERINTRA_PROB)) - cm->fc.interintra_prob = (vp9_prob)vp9_read_literal(bc, 8); + cm->fc.interintra_prob = vp9_read_prob(bc); } #endif // Decode the baseline probabilities for decoding reference frame - cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8); - cm->prob_last_coded = (vp9_prob)vp9_read_literal(bc, 8); - cm->prob_gf_coded = (vp9_prob)vp9_read_literal(bc, 8); + cm->prob_intra_coded = vp9_read_prob(bc); + cm->prob_last_coded = vp9_read_prob(bc); + cm->prob_gf_coded = vp9_read_prob(bc); // Computes a modified set of probabilities for use when reference // frame prediction fails. @@ -529,14 +529,14 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) { if (cm->comp_pred_mode == HYBRID_PREDICTION) { int i; for (i = 0; i < COMP_PRED_CONTEXTS; i++) - cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8); + cm->prob_comppred[i] = vp9_read_prob(bc); } if (vp9_read_bit(bc)) { int i = 0; do { - cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8); + cm->fc.ymode_prob[i] = vp9_read_prob(bc); } while (++i < VP9_YMODES - 1); } @@ -544,7 +544,7 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) { int i = 0; do { - cm->fc.sb_ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8); + cm->fc.sb_ymode_prob[i] = vp9_read_prob(bc); } while (++i < VP9_I32X32_MODES - 1); } @@ -1141,7 +1141,7 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) { if (pbi->common.mb_no_coeff_skip) { int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8); + cm->mbskip_pred_probs[k] = vp9_read_prob(bc); } } diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index ba7570347..40e5b1451 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -44,7 +44,6 @@ int dec_debug = 0; #endif - static int read_le16(const uint8_t *p) { return (p[1] << 8) | p[0]; } @@ -1278,61 +1277,51 @@ static void update_frame_size(VP9D_COMP *pbi) { vp9_update_mode_info_in_image(cm, cm->mi); } -static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, - BOOL_DECODER *header_bc) { +static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) { int i, j; - // Is segmentation enabled - xd->segmentation_enabled = vp9_read_bit(header_bc); - + xd->segmentation_enabled = vp9_read_bit(r); if (xd->segmentation_enabled) { // Read whether or not the segmentation map is being explicitly updated // this frame. - xd->update_mb_segmentation_map = vp9_read_bit(header_bc); + xd->update_mb_segmentation_map = vp9_read_bit(r); // If so what method will be used. if (xd->update_mb_segmentation_map) { // Which macro block level features are enabled. Read the probs used to // decode the segment id for each macro block. for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) { - xd->mb_segment_tree_probs[i] = vp9_read_bit(header_bc) ? - (vp9_prob)vp9_read_literal(header_bc, 8) : 255; + xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r) : 255; } // Read the prediction probs needed to decode the segment id - pc->temporal_update = vp9_read_bit(header_bc); + pc->temporal_update = vp9_read_bit(r); for (i = 0; i < PREDICTION_PROBS; i++) { - if (pc->temporal_update) { - pc->segment_pred_probs[i] = vp9_read_bit(header_bc) ? - (vp9_prob)vp9_read_literal(header_bc, 8) : 255; - } else { - pc->segment_pred_probs[i] = 255; - } + pc->segment_pred_probs[i] = pc->temporal_update + ? (vp9_read_bit(r) ? vp9_read_prob(r) : 255) + : 255; } if (pc->temporal_update) { - int count[4]; const vp9_prob *p = xd->mb_segment_tree_probs; vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs; - - count[0] = p[0] * p[1]; - count[1] = p[0] * (256 - p[1]); - count[2] = (256 - p[0]) * p[2]; - count[3] = (256 - p[0]) * (256 - p[2]); - - p_mod[0] = get_binary_prob(count[1], count[2] + count[3]); - p_mod[1] = get_binary_prob(count[0], count[2] + count[3]); - p_mod[2] = get_binary_prob(count[0] + count[1], count[3]); - p_mod[3] = get_binary_prob(count[0] + count[1], count[2]); + const int c0 = p[0] * p[1]; + const int c1 = p[0] * (256 - p[1]); + const int c2 = (256 - p[0]) * p[2]; + const int c3 = (256 - p[0]) * (256 - p[2]); + + p_mod[0] = get_binary_prob(c1, c2 + c3); + p_mod[1] = get_binary_prob(c0, c2 + c3); + p_mod[2] = get_binary_prob(c0 + c1, c3); + p_mod[3] = get_binary_prob(c0 + c1, c2); } } - // Is the segment data being updated - xd->update_mb_segmentation_data = vp9_read_bit(header_bc); + xd->update_mb_segmentation_data = vp9_read_bit(r); if (xd->update_mb_segmentation_data) { int data; - xd->mb_segment_abs_delta = vp9_read_bit(header_bc); + xd->mb_segment_abs_delta = vp9_read_bit(r); vp9_clearall_segfeatures(xd); @@ -1341,16 +1330,15 @@ static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, // For each of the segments features... for (j = 0; j < SEG_LVL_MAX; j++) { // Is the feature enabled - if (vp9_read_bit(header_bc)) { + if (vp9_read_bit(r)) { // Update the feature data and mask vp9_enable_segfeature(xd, i, j); - data = vp9_decode_unsigned_max(header_bc, - vp9_seg_feature_data_max(j)); + data = vp9_decode_unsigned_max(r, vp9_seg_feature_data_max(j)); // Is the segment data signed.. if (vp9_is_segfeature_signed(j)) { - if (vp9_read_bit(header_bc)) + if (vp9_read_bit(r)) data = -data; } } else { @@ -1364,17 +1352,16 @@ static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, } } -static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, - BOOL_DECODER *header_bc) { +static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) { int i; - pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(header_bc); - pc->filter_level = vp9_read_literal(header_bc, 6); - pc->sharpness_level = vp9_read_literal(header_bc, 3); + pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(r); + pc->filter_level = vp9_read_literal(r, 6); + pc->sharpness_level = vp9_read_literal(r, 3); #if CONFIG_LOOP_DERING - if (vp9_read_bit(header_bc)) - pc->dering_enabled = 1 + vp9_read_literal(header_bc, 4); + if (vp9_read_bit(r)) + pc->dering_enabled = 1 + vp9_read_literal(r, 4); else pc->dering_enabled = 0; #endif @@ -1382,31 +1369,31 @@ static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, // Read in loop filter deltas applied at the MB level based on mode or ref // frame. xd->mode_ref_lf_delta_update = 0; - xd->mode_ref_lf_delta_enabled = vp9_read_bit(header_bc); + xd->mode_ref_lf_delta_enabled = vp9_read_bit(r); if (xd->mode_ref_lf_delta_enabled) { // Do the deltas need to be updated - xd->mode_ref_lf_delta_update = vp9_read_bit(header_bc); + xd->mode_ref_lf_delta_update = vp9_read_bit(r); if (xd->mode_ref_lf_delta_update) { // Send update for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - if (vp9_read_bit(header_bc)) { - // sign = vp9_read_bit( &header_bc ); - xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(header_bc, 6); + if (vp9_read_bit(r)) { + // sign = vp9_read_bit(r); + xd->ref_lf_deltas[i] = vp9_read_literal(r, 6); - if (vp9_read_bit(header_bc)) + if (vp9_read_bit(r)) xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i]; // Apply sign } } // Send update for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - if (vp9_read_bit(header_bc)) { - // sign = vp9_read_bit( &header_bc ); - xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(header_bc, 6); + if (vp9_read_bit(r)) { + // sign = vp9_read_bit(r); + xd->mode_lf_deltas[i] = vp9_read_literal(r, 6); - if (vp9_read_bit(header_bc)) + if (vp9_read_bit(r)) xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i]; // Apply sign } } @@ -1414,6 +1401,124 @@ static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, } } +static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const pc = &pbi->common; + const int width = pc->width; + const int height = pc->height; + + // If error concealment is enabled we should only parse the new size + // if we have enough data. Otherwise we will end up with the wrong size. + if (scaling_active && data + 4 < data_end) { + pc->display_width = read_le16(data + 0); + pc->display_height = read_le16(data + 2); + data += 4; + } + + if (data + 4 < data_end) { + pc->width = read_le16(data + 0); + pc->height = read_le16(data + 2); + data += 4; + } + + if (!scaling_active) { + pc->display_width = pc->width; + pc->display_height = pc->height; + } + + if (width != pc->width || height != pc->height) { + if (pc->width <= 0) { + pc->width = width; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame width"); + } + + if (pc->height <= 0) { + pc->height = height; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame height"); + } + + if (!pbi->initial_width || !pbi->initial_height) { + if (vp9_alloc_frame_buffers(pc, pc->width, pc->height)) + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); + pbi->initial_width = pc->width; + pbi->initial_height = pc->height; + } + + if (pc->width > pbi->initial_width) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Frame width too large"); + } + + if (pc->height > pbi->initial_height) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Frame height too large"); + } + + update_frame_size(pbi); + } + + return data; +} + +static void update_frame_context(VP9D_COMP *pbi, vp9_reader *r) { + FRAME_CONTEXT *const fc = &pbi->common.fc; + + vp9_copy(fc->pre_coef_probs_4x4, fc->coef_probs_4x4); + vp9_copy(fc->pre_coef_probs_8x8, fc->coef_probs_8x8); + vp9_copy(fc->pre_coef_probs_16x16, fc->coef_probs_16x16); + vp9_copy(fc->pre_coef_probs_32x32, fc->coef_probs_32x32); + vp9_copy(fc->pre_ymode_prob, fc->ymode_prob); + vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob); + vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob); + vp9_copy(fc->pre_bmode_prob, fc->bmode_prob); + vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob); + vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob); + vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob); + fc->pre_nmvc = fc->nmvc; + + vp9_zero(fc->coef_counts_4x4); + vp9_zero(fc->coef_counts_8x8); + vp9_zero(fc->coef_counts_16x16); + vp9_zero(fc->coef_counts_32x32); + vp9_zero(fc->eob_branch_counts); + vp9_zero(fc->ymode_counts); + vp9_zero(fc->sb_ymode_counts); + vp9_zero(fc->uv_mode_counts); + vp9_zero(fc->bmode_counts); + vp9_zero(fc->i8x8_mode_counts); + vp9_zero(fc->sub_mv_ref_counts); + vp9_zero(fc->mbsplit_counts); + vp9_zero(fc->NMVcount); + vp9_zero(fc->mv_ref_ct); + +#if CONFIG_COMP_INTERINTRA_PRED + fc->pre_interintra_prob = fc->interintra_prob; + vp9_zero(fc->interintra_counts); +#endif + +#if CONFIG_CODE_NONZEROCOUNT + vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4); + vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8); + vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16); + vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32); + vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs); + + vp9_zero(fc->nzc_counts_4x4); + vp9_zero(fc->nzc_counts_8x8); + vp9_zero(fc->nzc_counts_16x16); + vp9_zero(fc->nzc_counts_32x32); + vp9_zero(fc->nzc_pcat_counts); +#endif + + read_coef_probs(pbi, r); +#if CONFIG_CODE_NONZEROCOUNT + read_nzc_probs(&pbi->common, r); +#endif +} int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { BOOL_DECODER header_bc, residual_bc; @@ -1425,8 +1530,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { int mb_row, i, corrupt_tokens = 0; // printf("Decoding frame %d\n", pc->current_video_frame); - /* start with no corruption of current frame */ - xd->corrupted = 0; + + xd->corrupted = 0; // start with no corruption of current frame pc->yv12_fb[pc->new_fb_idx].corrupted = 0; if (data_end - data < 3) { @@ -1449,10 +1554,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { vp9_setup_version(pc); if (pc->frame_type == KEY_FRAME) { - /* vet via sync code */ - /* When error concealment is enabled we should only check the sync - * code if we have enough bits available - */ + // When error concealment is enabled we should only check the sync + // code if we have enough bits available if (data + 3 < data_end) { if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a) vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, @@ -1460,63 +1563,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { } data += 3; } - { - const int width = pc->width; - const int height = pc->height; - - /* If error concealment is enabled we should only parse the new size - * if we have enough data. Otherwise we will end up with the wrong - * size. - */ - if (scaling_active && data + 4 < data_end) { - pc->display_width = read_le16(data + 0); - pc->display_height = read_le16(data + 2); - data += 4; - } - if (data + 4 < data_end) { - pc->width = read_le16(data + 0); - pc->height = read_le16(data + 2); - data += 4; - } - if (!scaling_active) { - pc->display_width = pc->width; - pc->display_height = pc->height; - } - - if (width != pc->width || height != pc->height) { - if (pc->width <= 0) { - pc->width = width; - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid frame width"); - } - - if (pc->height <= 0) { - pc->height = height; - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid frame height"); - } - - if (!pbi->initial_width || !pbi->initial_height) { - if (vp9_alloc_frame_buffers(pc, pc->width, pc->height)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffers"); - pbi->initial_width = pc->width; - pbi->initial_height = pc->height; - } - - if (pc->width > pbi->initial_width) { - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Frame width too large"); - } - if (pc->height > pbi->initial_height) { - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Frame height too large"); - } - - update_frame_size(pbi); - } - } + data = setup_frame_size(pbi, scaling_active, data, data_end); } if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) || @@ -1526,7 +1574,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { init_frame(pbi); - /* Reset the frame pointers to the current frame size */ + // Reset the frame pointers to the current frame size vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx], pc->width, pc->height, VP9BORDERINPIXELS); @@ -1535,9 +1583,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { (unsigned int)first_partition_length_in_bytes)) vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder 0"); - pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc); - pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc); + pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc); + pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc); pc->error_resilient_mode = vp9_read_bit(&header_bc); setup_segmentation(pc, xd, &header_bc); @@ -1552,25 +1600,25 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { } else { for (i = 0; i < PREDICTION_PROBS; i++) { if (vp9_read_bit(&header_bc)) - pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8); + pc->ref_pred_probs[i] = vp9_read_prob(&header_bc); } } - pc->sb64_coded = vp9_read_literal(&header_bc, 8); - pc->sb32_coded = vp9_read_literal(&header_bc, 8); + pc->sb64_coded = vp9_read_prob(&header_bc); + pc->sb32_coded = vp9_read_prob(&header_bc); xd->lossless = vp9_read_bit(&header_bc); if (xd->lossless) { pc->txfm_mode = ONLY_4X4; } else { // Read the loop filter level and type pc->txfm_mode = vp9_read_literal(&header_bc, 2); - if (pc->txfm_mode == 3) + if (pc->txfm_mode == ALLOW_32X32) pc->txfm_mode += vp9_read_bit(&header_bc); if (pc->txfm_mode == TX_MODE_SELECT) { - pc->prob_tx[0] = vp9_read_literal(&header_bc, 8); - pc->prob_tx[1] = vp9_read_literal(&header_bc, 8); - pc->prob_tx[2] = vp9_read_literal(&header_bc, 8); + pc->prob_tx[0] = vp9_read_prob(&header_bc); + pc->prob_tx[1] = vp9_read_prob(&header_bc); + pc->prob_tx[2] = vp9_read_prob(&header_bc); } } @@ -1596,22 +1644,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { mb_init_dequantizer(pbi, &pbi->mb); } - /* Determine if the golden frame or ARF buffer should be updated and how. - * For all non key frames the GF and ARF refresh flags and sign bias - * flags must be set explicitly. - */ + // Determine if the golden frame or ARF buffer should be updated and how. + // For all non key frames the GF and ARF refresh flags and sign bias + // flags must be set explicitly. if (pc->frame_type == KEY_FRAME) { pc->active_ref_idx[0] = pc->new_fb_idx; pc->active_ref_idx[1] = pc->new_fb_idx; pc->active_ref_idx[2] = pc->new_fb_idx; } else { - /* Should the GF or ARF be updated from the current frame */ + // Should the GF or ARF be updated from the current frame pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES); - /* Select active reference frames */ + // Select active reference frames for (i = 0; i < 3; i++) { int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2); - pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num]; } @@ -1619,16 +1665,17 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc); // Is high precision mv allowed - xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc); + xd->allow_high_precision_mv = vp9_read_bit(&header_bc); // Read the type of subpel filter to use - pc->mcomp_filter_type = vp9_read_bit(&header_bc) ? SWITCHABLE : - vp9_read_literal(&header_bc, 2); + pc->mcomp_filter_type = vp9_read_bit(&header_bc) + ? SWITCHABLE + : vp9_read_literal(&header_bc, 2); #if CONFIG_COMP_INTERINTRA_PRED pc->use_interintra = vp9_read_bit(&header_bc); #endif - /* To enable choice of different interploation filters */ + // To enable choice of different interploation filters vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc); } @@ -1649,8 +1696,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { for (i = 0; i < INTER_MODE_CONTEXTS; i++) { for (j = 0; j < 4; j++) { if (vp9_read(&header_bc, 252)) { - pc->fc.vp9_mode_contexts[i][j] = - (vp9_prob)vp9_read_literal(&header_bc, 8); + pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc); } } } @@ -1675,8 +1721,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { // Read any updates to probabilities for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) { if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) { - xd->mb_mv_ref_probs[i][j] = - (vp9_prob)vp9_read_literal(&header_bc, 8); + xd->mb_mv_ref_probs[i][j] = vp9_read_prob(&header_bc); } } } @@ -1693,69 +1738,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { fclose(z); } - vp9_copy(pbi->common.fc.pre_coef_probs_4x4, - pbi->common.fc.coef_probs_4x4); - vp9_copy(pbi->common.fc.pre_coef_probs_8x8, - pbi->common.fc.coef_probs_8x8); - vp9_copy(pbi->common.fc.pre_coef_probs_16x16, - pbi->common.fc.coef_probs_16x16); - vp9_copy(pbi->common.fc.pre_coef_probs_32x32, - pbi->common.fc.coef_probs_32x32); - vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob); - vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob); - vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob); - vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob); - vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob); - vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob); - vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob); -#if CONFIG_COMP_INTERINTRA_PRED - pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob; -#endif - pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc; -#if CONFIG_CODE_NONZEROCOUNT - vp9_copy(pbi->common.fc.pre_nzc_probs_4x4, - pbi->common.fc.nzc_probs_4x4); - vp9_copy(pbi->common.fc.pre_nzc_probs_8x8, - pbi->common.fc.nzc_probs_8x8); - vp9_copy(pbi->common.fc.pre_nzc_probs_16x16, - pbi->common.fc.nzc_probs_16x16); - vp9_copy(pbi->common.fc.pre_nzc_probs_32x32, - pbi->common.fc.nzc_probs_32x32); - vp9_copy(pbi->common.fc.pre_nzc_pcat_probs, - pbi->common.fc.nzc_pcat_probs); -#endif - - vp9_zero(pbi->common.fc.coef_counts_4x4); - vp9_zero(pbi->common.fc.coef_counts_8x8); - vp9_zero(pbi->common.fc.coef_counts_16x16); - vp9_zero(pbi->common.fc.coef_counts_32x32); - vp9_zero(pbi->common.fc.eob_branch_counts); - vp9_zero(pbi->common.fc.ymode_counts); - vp9_zero(pbi->common.fc.sb_ymode_counts); - vp9_zero(pbi->common.fc.uv_mode_counts); - vp9_zero(pbi->common.fc.bmode_counts); - vp9_zero(pbi->common.fc.i8x8_mode_counts); - vp9_zero(pbi->common.fc.sub_mv_ref_counts); - vp9_zero(pbi->common.fc.mbsplit_counts); - vp9_zero(pbi->common.fc.NMVcount); - vp9_zero(pbi->common.fc.mv_ref_ct); -#if CONFIG_COMP_INTERINTRA_PRED - vp9_zero(pbi->common.fc.interintra_counts); -#endif -#if CONFIG_CODE_NONZEROCOUNT - vp9_zero(pbi->common.fc.nzc_counts_4x4); - vp9_zero(pbi->common.fc.nzc_counts_8x8); - vp9_zero(pbi->common.fc.nzc_counts_16x16); - vp9_zero(pbi->common.fc.nzc_counts_32x32); - vp9_zero(pbi->common.fc.nzc_pcat_counts); -#endif - - read_coef_probs(pbi, &header_bc); -#if CONFIG_CODE_NONZEROCOUNT - read_nzc_probs(&pbi->common, &header_bc); -#endif + update_frame_context(pbi, &header_bc); - /* Initialize xd pointers. Any reference should do for xd->pre, so use 0. */ + // Initialize xd pointers. Any reference should do for xd->pre, so use 0. vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]], sizeof(YV12_BUFFER_CONFIG)); vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h index 305dfe51f..4ec6de99d 100644 --- a/vp9/decoder/vp9_treereader.h +++ b/vp9/decoder/vp9_treereader.h @@ -19,10 +19,10 @@ typedef BOOL_DECODER vp9_reader; #define vp9_read decode_bool #define vp9_read_literal decode_value -#define vp9_read_bit(R) vp9_read(R, vp9_prob_half) - -/* Intent of tree data structure is to make decoding trivial. */ +#define vp9_read_bit(r) vp9_read(r, vp9_prob_half) +#define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8)) +// Intent of tree data structure is to make decoding trivial. static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */ vp9_tree t, const vp9_prob *const p) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 82bd70bf8..d26f5ec46 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -89,45 +89,31 @@ static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, // tables if and when things settle down in the experimental bitstream double vp9_convert_qindex_to_q(int qindex) { // Convert the index to a real Q value (scaled down to match old Q values) - return (double)vp9_ac_yquant(qindex) / 4.0; + return vp9_ac_yquant(qindex) / 4.0; } int vp9_gfboost_qadjust(int qindex) { - int retval; - double q; - - q = vp9_convert_qindex_to_q(qindex); - retval = (int)((0.00000828 * q * q * q) + - (-0.0055 * q * q) + - (1.32 * q) + 79.3); - return retval; + const double q = vp9_convert_qindex_to_q(qindex); + return (int)((0.00000828 * q * q * q) + + (-0.0055 * q * q) + + (1.32 * q) + 79.3); } static int kfboost_qadjust(int qindex) { - int retval; - double q; - - q = vp9_convert_qindex_to_q(qindex); - retval = (int)((0.00000973 * q * q * q) + - (-0.00613 * q * q) + - (1.316 * q) + 121.2); - return retval; + const double q = vp9_convert_qindex_to_q(qindex); + return (int)((0.00000973 * q * q * q) + + (-0.00613 * q * q) + + (1.316 * q) + 121.2); } int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor) { - int enumerator; - double q = vp9_convert_qindex_to_q(qindex); - if (frame_type == KEY_FRAME) { - enumerator = 4000000; - } else { - enumerator = 2500000; - } + const double q = vp9_convert_qindex_to_q(qindex); + int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000; - // Q based adjustment to baseline enumberator + // q based adjustment to baseline enumberator enumerator += (int)(enumerator * q) >> 12; - return (int)(0.5 + (enumerator * correction_factor / q)); } @@ -265,33 +251,30 @@ void vp9_setup_key_frame(VP9_COMP *cpi) { // interval before next GF cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; /* All buffers are implicitly updated on key frames. */ - cpi->refresh_golden_frame = TRUE; - cpi->refresh_alt_ref_frame = TRUE; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; } void vp9_setup_inter_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; - if (cm->error_resilient_mode) { + if (cm->error_resilient_mode) vp9_setup_past_independence(cm, xd); - } + assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS); vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx], sizeof(cm->fc)); } -static int estimate_bits_at_q(int frame_kind, int Q, int MBs, +static int estimate_bits_at_q(int frame_kind, int q, int mbs, double correction_factor) { - int Bpm = (int)(vp9_bits_per_mb(frame_kind, Q, correction_factor)); + const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor)); - /* Attempt to retain reasonable accuracy without overflow. The cutoff is - * chosen such that the maximum product of Bpm and MBs fits 31 bits. The - * largest Bpm takes 20 bits. - */ - if (MBs > (1 << 11)) - return (Bpm >> BPER_MB_NORMBITS) * MBs; - else - return (Bpm * MBs) >> BPER_MB_NORMBITS; + // Attempt to retain reasonable accuracy without overflow. The cutoff is + // chosen such that the maximum product of Bpm and MBs fits 31 bits. The + // largest Bpm takes 20 bits. + return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs + : (bpm * mbs) >> BPER_MB_NORMBITS; } @@ -314,7 +297,6 @@ static void calc_iframe_target_size(VP9_COMP *cpi) { } cpi->this_frame_target = target; - } @@ -330,25 +312,15 @@ static void calc_gf_params(VP9_COMP *cpi) { static void calc_pframe_target_size(VP9_COMP *cpi) { - int min_frame_target; - - min_frame_target = 0; - - min_frame_target = cpi->min_frame_bandwidth; - - if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5)) - min_frame_target = cpi->av_per_frame_bandwidth >> 5; - - - // Special alt reference frame case + const int min_frame_target = MAX(cpi->min_frame_bandwidth, + cpi->av_per_frame_bandwidth >> 5); if (cpi->refresh_alt_ref_frame) { + // Special alt reference frame case // Per frame bit target for the alt ref frame cpi->per_frame_bandwidth = cpi->twopass.gf_bits; cpi->this_frame_target = cpi->per_frame_bandwidth; - } - - // Normal frames (gf,and inter) - else { + } else { + // Normal frames (gf,and inter) cpi->this_frame_target = cpi->per_frame_bandwidth; } @@ -366,10 +338,10 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { // Adjust target frame size for Golden Frames: if (cpi->frames_till_gf_update_due == 0) { - // int Boost = 0; - int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + const int q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] + : cpi->oxcf.fixed_q; - cpi->refresh_golden_frame = TRUE; + cpi->refresh_golden_frame = 1; calc_gf_params(cpi); @@ -381,17 +353,17 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { // The spend on the GF is defined in the two pass code // for two pass encodes cpi->this_frame_target = cpi->per_frame_bandwidth; - } else + } else { cpi->this_frame_target = - (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) + (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0) * cpi->last_boost) / 100; + } - } - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a contructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for contstructed ARFs. - else { + } else { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a contructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for contstructed ARFs. cpi->this_frame_target = 0; } @@ -401,12 +373,12 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { - int Q = cpi->common.base_qindex; - int correction_factor = 100; + const int q = cpi->common.base_qindex; + int correction_factor = 100; double rate_correction_factor; double adjustment_limit; - int projected_size_based_on_q = 0; + int projected_size_based_on_q = 0; // Clear down mmx registers to allow floating point in what follows vp9_clear_system_state(); // __asm emms; @@ -423,9 +395,9 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { // Work out how big we would have expected the frame to be at this Q given // the current correction factor. // Stay in double to avoid int overflow when values are large - projected_size_based_on_q = - estimate_bits_at_q(cpi->common.frame_type, Q, - cpi->common.MBs, rate_correction_factor); + projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q, + cpi->common.MBs, + rate_correction_factor); // Work out a size correction factor. // if ( cpi->this_frame_target > 0 ) @@ -480,7 +452,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { - int Q = cpi->active_worst_quality; + int q = cpi->active_worst_quality; int i; int last_error = INT_MAX; @@ -507,21 +479,22 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { i = cpi->active_best_quality; do { - bits_per_mb_at_this_q = - (int)(vp9_bits_per_mb(cpi->common.frame_type, i, correction_factor)); + bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i, + correction_factor); if (bits_per_mb_at_this_q <= target_bits_per_mb) { if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) - Q = i; + q = i; else - Q = i - 1; + q = i - 1; break; - } else + } else { last_error = bits_per_mb_at_this_q - target_bits_per_mb; + } } while (++i <= cpi->active_worst_quality); - return Q; + return q; } @@ -566,7 +539,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) { total_weight += prior_key_frame_weight[i]; } - av_key_frame_frequency /= total_weight; + av_key_frame_frequency /= total_weight; } return av_key_frame_frequency; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 19255bbf9..56453e249 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -37,6 +37,7 @@ struct vp8_extracfg { unsigned int cq_level; /* constrained quality level */ unsigned int rc_max_intra_bitrate_pct; unsigned int lossless; + unsigned int frame_parallel_decoding_mode; }; struct extraconfig_map { @@ -64,6 +65,7 @@ static const struct extraconfig_map extracfg_map[] = { 10, /* cq_level */ 0, /* rc_max_intra_bitrate_pct */ 0, /* lossless */ + 0, /* frame_parallel_decoding_mode */ } } }; @@ -313,7 +315,7 @@ static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf, oxcf->lossless = vp8_cfg.lossless; oxcf->error_resilient_mode = cfg.g_error_resilient; - oxcf->frame_parallel_decoding_mode = cfg.g_frame_parallel_decoding; + oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -423,6 +425,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); MAP(VP9E_SET_LOSSLESS, xcfg.lossless); + MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode); } res = validate_config(ctx, &ctx->cfg, &xcfg); @@ -1096,7 +1099,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { {1, 30}, /* g_timebase */ 0, /* g_error_resilient */ - 0, /* g_frame_parallel_decoding */ VPX_RC_ONE_PASS, /* g_pass */ diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 66c89b5a9..d0c23f07a 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -64,6 +64,7 @@ struct vpx_codec_alg_priv { vpx_image_t img; int img_setup; int img_avail; + int invert_tile_order; }; static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, @@ -333,7 +334,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, oxcf.Version = 9; oxcf.postprocess = 0; oxcf.max_threads = ctx->cfg.threads; - oxcf.inv_tile_order = ctx->cfg.inv_tile_order; + oxcf.inv_tile_order = ctx->invert_tile_order; optr = vp9_create_decompressor(&oxcf); /* If postprocessing was enabled by the application and a @@ -726,6 +727,13 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } +static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t ctf_maps[] = { {VP8_SET_REFERENCE, vp9_set_reference}, {VP8_COPY_REFERENCE, vp9_copy_reference}, @@ -737,6 +745,7 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = { {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, {VP9_GET_REFERENCE, get_reference}, + {VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order}, { -1, NULL}, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 0b910b99d..7f19dd033 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -190,6 +190,7 @@ enum vp8e_enc_control_id { VP9E_SET_LOSSLESS, VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, + VP9E_SET_FRAME_PARALLEL_DECODING }; /*!\brief vpx 1-D scaling mode @@ -310,6 +311,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) /*! @} - end defgroup vp8_encoder */ #include "vpx_codec_impl_bottom.h" #endif diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index e2ec8b213..201df88fe 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -63,6 +63,15 @@ enum vp8_dec_control_id { */ VP8D_GET_LAST_REF_USED, + /** decryption key to protect encoded data buffer before decoding, + * pointer to 32 byte array which is copied, so the array passed + * does not need to be preserved + */ + VP8_SET_DECRYPT_KEY, + + /** For testing. */ + VP9_INVERT_TILE_DECODE_ORDER, + VP8_DECODER_CTRL_ID_MAX }; @@ -78,6 +87,8 @@ enum vp8_dec_control_id { VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) +VPX_CTRL_USE_TYPE(VP8_SET_DECRYPT_KEY, const unsigned char *) +VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) /*! @} - end defgroup vp8_decoder */ diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index ddbc0a6d6..e7701e512 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -106,7 +106,6 @@ extern "C" { unsigned int threads; /**< Maximum number of threads to use, default 1 */ unsigned int w; /**< Width */ unsigned int h; /**< Height */ - int inv_tile_order; /**< Invert tile decoding order, default 0 */ } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 2ec09bdd4..ffdbc0644 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -334,12 +334,6 @@ extern "C" { */ vpx_codec_er_flags_t g_error_resilient; - /*!\brief Enable frame parallel decoding mode - * This value should be 1 to encode in a way that enables frame parallel - * decoding. Otherwise make it 0. - */ - unsigned int g_frame_parallel_decoding; - /*!\brief Multi-pass Encoding Mode * @@ -1024,7 +1024,7 @@ int main(int argc, const char **argv_) { if (!noblit) { if (do_scale) { - if (frame_out == 1) { + if (img && frame_out == 1) { stream_w = img->d_w; stream_h = img->d_h; scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, @@ -89,8 +89,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, static const char *exec_name; -#define VP8_FOURCC (0x00385056) -#define VP9_FOURCC (0x00395056) +#define VP8_FOURCC (0x30385056) +#define VP9_FOURCC (0x30395056) static const struct codec_item { char const *name; const vpx_codec_iface_t *(*iface)(void); @@ -1023,10 +1023,6 @@ static const arg_def_t timebase = ARG_DEF(NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"); static const arg_def_t error_resilient = ARG_DEF(NULL, "error-resilient", 1, "Enable error resiliency features"); -#if CONFIG_VP9_ENCODER -static const arg_def_t frame_parallel_decoding = ARG_DEF( - NULL, "frame-parallel", 1, "Enable frame parallel decodability features"); -#endif static const arg_def_t lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); @@ -1034,9 +1030,6 @@ static const arg_def_t *global_args[] = { &use_yv12, &use_i420, &usage, &threads, &profile, &width, &height, &stereo_mode, &timebase, &framerate, &error_resilient, -#if CONFIG_VP9_ENCODER - &frame_parallel_decoding, -#endif &lag_in_frames, NULL }; @@ -1136,6 +1129,10 @@ static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode"); +#if CONFIG_VP9_ENCODER +static const arg_def_t frame_parallel_decoding = ARG_DEF( + NULL, "frame-parallel", 1, "Enable frame parallel decodability features"); +#endif #if CONFIG_VP8_ENCODER static const arg_def_t *vp8_args[] = { @@ -1159,6 +1156,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless, + &frame_parallel_decoding, NULL }; static const int vp9_arg_ctrl_map[] = { @@ -1167,7 +1165,7 @@ static const int vp9_arg_ctrl_map[] = { VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, - VP9E_SET_LOSSLESS, + VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, 0 }; #endif @@ -1936,10 +1934,6 @@ static int parse_stream_params(struct global_config *global, validate_positive_rational(arg.name, &config->cfg.g_timebase); } else if (arg_match(&arg, &error_resilient, argi)) config->cfg.g_error_resilient = arg_parse_uint(&arg); -#if CONFIG_VP9_ENCODER - else if (arg_match(&arg, &frame_parallel_decoding, argi)) - config->cfg.g_frame_parallel_decoding = arg_parse_uint(&arg); -#endif else if (arg_match(&arg, &lag_in_frames, argi)) config->cfg.g_lag_in_frames = arg_parse_uint(&arg); else if (arg_match(&arg, &dropframe_thresh, argi)) @@ -2124,9 +2118,6 @@ static void show_stream_config(struct stream_state *stream, SHOW(g_timebase.num); SHOW(g_timebase.den); SHOW(g_error_resilient); -#if CONFIG_VP9_ENCODER - SHOW(g_frame_parallel_decoding); -#endif SHOW(g_pass); SHOW(g_lag_in_frames); SHOW(rc_dropframe_thresh); @@ -2560,7 +2551,7 @@ int main(int argc, const char **argv_) { usage_exit(); for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) { - int frames_in = 0; + int frames_in = 0, seen_frames = 0; int64_t estimated_time_left = -1; int64_t average_rate = -1; off_t lagged_count = 0; @@ -2640,9 +2631,11 @@ int main(int argc, const char **argv_) { if (frame_avail) frames_in++; + seen_frames = frames_in > global.skip_frames ? + frames_in - global.skip_frames : 0; if (!global.quiet) { - float fps = usec_to_fps(cx_time, frames_in); + float fps = usec_to_fps(cx_time, seen_frames); fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); if (stream_cnt == 1) @@ -2678,16 +2671,17 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(get_cx_data(stream, &global, &got_data)); if (!got_data && input.length && !streams->frames_out) { - lagged_count = global.limit ? frames_in : ftello(input.file); + lagged_count = global.limit ? seen_frames : ftello(input.file); } else if (input.length) { int64_t remaining; int64_t rate; if (global.limit) { - int frame_in_lagged = (frames_in - lagged_count) * 1000; + int frame_in_lagged = (seen_frames - lagged_count) * 1000; rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; - remaining = 1000 * (global.limit - frames_in + lagged_count); + remaining = 1000 * (global.limit - global.skip_frames + - seen_frames + lagged_count); } else { off_t input_pos = ftello(input.file); off_t input_pos_lagged = input_pos - lagged_count; @@ -2719,14 +2713,14 @@ int main(int argc, const char **argv_) { "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s" " %7"PRId64" %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, - frames_in ? (unsigned long)(stream->nbytes * 8 / frames_in) : 0, - frames_in ? (int64_t)stream->nbytes * 8 + seen_frames ? (unsigned long)(stream->nbytes * 8 / seen_frames) : 0, + seen_frames ? (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / global.framerate.den - / frames_in + / seen_frames : 0, stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time, stream->cx_time > 9999999 ? "ms" : "us", - usec_to_fps(stream->cx_time, frames_in)); + usec_to_fps(stream->cx_time, seen_frames)); ); if (global.show_psnr) |