diff options
66 files changed, 1656 insertions, 1168 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh index bb7ab4110..f3610218b 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1062,7 +1062,7 @@ EOF setup_gnu_toolchain add_cflags -use-msasm -use-asm add_ldflags -i-static - enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE2 -axSSE2 + enabled x86_64 && add_cflags -ipo -static -O3 enabled x86_64 && AR=xiar case ${tune_cpu} in atom*) diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh index 6cc36843b..9a8d97e32 100755 --- a/build/make/rtcd.sh +++ b/build/make/rtcd.sh @@ -290,9 +290,11 @@ static void setup_rtcd_internal(void) { $(set_function_pointers c $ALL_ARCHS) #if HAVE_DSPR2 +#if CONFIG_VP8 void dsputil_static_init(); dsputil_static_init(); #endif +#endif } #endif $(common_bottom) diff --git a/build/make/thumb.pm b/build/make/thumb.pm index 545f59f43..e1f34c1ec 100644 --- a/build/make/thumb.pm +++ b/build/make/thumb.pm @@ -47,7 +47,7 @@ sub FixThumbInstructions($$) # this is used, it's used for two subsequent load instructions, # where a hand-written version of it could merge two subsequent # add and sub instructions. - s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,)?\s*\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g; + s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g; # Convert register post indexing to a separate add instruction. 
# This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]", diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index f5f6d5b3f..5fba70025 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc @@ -34,13 +34,17 @@ class IntraPredBase { } protected: - void SetupMacroblock(uint8_t *data, int block_size, int stride, + void SetupMacroblock(MACROBLOCKD *mbptr, + MODE_INFO *miptr, + uint8_t *data, + int block_size, + int stride, int num_planes) { - memset(&mb_, 0, sizeof(mb_)); - memset(&mi_, 0, sizeof(mi_)); - mb_.up_available = 1; - mb_.left_available = 1; - mb_.mode_info_context = &mi_; + mbptr_ = mbptr; + miptr_ = miptr; + mbptr_->up_available = 1; + mbptr_->left_available = 1; + mbptr_->mode_info_context = miptr_; stride_ = stride; block_size_ = block_size; num_planes_ = num_planes; @@ -63,14 +67,14 @@ class IntraPredBase { virtual void Predict(MB_PREDICTION_MODE mode) = 0; void SetLeftUnavailable() { - mb_.left_available = 0; + mbptr_->left_available = 0; for (int p = 0; p < num_planes_; p++) for (int i = -1; i < block_size_; ++i) data_ptr_[p][stride_ * i - 1] = 129; } void SetTopUnavailable() { - mb_.up_available = 0; + mbptr_->up_available = 0; for (int p = 0; p < num_planes_; p++) memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2); } @@ -96,13 +100,13 @@ class IntraPredBase { for (int p = 0; p < num_planes_; p++) { // calculate expected DC int expected; - if (mb_.up_available || mb_.left_available) { - int sum = 0, shift = BlockSizeLog2Min1() + mb_.up_available + - mb_.left_available; - if (mb_.up_available) + if (mbptr_->up_available || mbptr_->left_available) { + int sum = 0, shift = BlockSizeLog2Min1() + mbptr_->up_available + + mbptr_->left_available; + if (mbptr_->up_available) for (int x = 0; x < block_size_; x++) sum += data_ptr_[p][x - stride_]; - if (mb_.left_available) + if (mbptr_->left_available) for (int y = 0; y < block_size_; y++) sum += data_ptr_[p][y * stride_ - 1]; expected = (sum + (1 << (shift - 1))) >> shift; @@ 
-209,8 +213,8 @@ class IntraPredBase { } } - MACROBLOCKD mb_; - MODE_INFO mi_; + MACROBLOCKD *mbptr_; + MODE_INFO *miptr_; uint8_t *data_ptr_[2]; // in the case of Y, only [0] is used int stride_; int block_size_; @@ -228,12 +232,18 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>, protected IntraPredBase { public: static void SetUpTestCase() { + mb_ = reinterpret_cast<MACROBLOCKD*>( + vpx_memalign(32, sizeof(MACROBLOCKD))); + mi_ = reinterpret_cast<MODE_INFO*>( + vpx_memalign(32, sizeof(MODE_INFO))); data_array_ = reinterpret_cast<uint8_t*>( vpx_memalign(kDataAlignment, kDataBufferSize)); } static void TearDownTestCase() { vpx_free(data_array_); + vpx_free(mi_); + vpx_free(mb_); data_array_ = NULL; } @@ -250,12 +260,12 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>, virtual void SetUp() { pred_fn_ = GetParam(); - SetupMacroblock(data_array_, kBlockSize, kStride, 1); + SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 1); } virtual void Predict(MB_PREDICTION_MODE mode) { - mb_.mode_info_context->mbmi.mode = mode; - REGISTER_STATE_CHECK(pred_fn_(&mb_, + mbptr_->mode_info_context->mbmi.mode = mode; + REGISTER_STATE_CHECK(pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride, data_ptr_[0], kStride)); @@ -263,8 +273,12 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>, intra_pred_y_fn_t pred_fn_; static uint8_t* data_array_; + static MACROBLOCKD * mb_; + static MODE_INFO *mi_; }; +MACROBLOCKD* IntraPredYTest::mb_ = NULL; +MODE_INFO* IntraPredYTest::mi_ = NULL; uint8_t* IntraPredYTest::data_array_ = NULL; TEST_P(IntraPredYTest, IntraPredTests) { @@ -299,12 +313,18 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>, protected IntraPredBase { public: static void SetUpTestCase() { + mb_ = reinterpret_cast<MACROBLOCKD*>( + vpx_memalign(32, sizeof(MACROBLOCKD))); + mi_ = reinterpret_cast<MODE_INFO*>( + vpx_memalign(32, sizeof(MODE_INFO))); 
data_array_ = reinterpret_cast<uint8_t*>( vpx_memalign(kDataAlignment, kDataBufferSize)); } static void TearDownTestCase() { vpx_free(data_array_); + vpx_free(mi_); + vpx_free(mb_); data_array_ = NULL; } @@ -322,12 +342,12 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>, virtual void SetUp() { pred_fn_ = GetParam(); - SetupMacroblock(data_array_, kBlockSize, kStride, 2); + SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 2); } virtual void Predict(MB_PREDICTION_MODE mode) { - mb_.mode_info_context->mbmi.uv_mode = mode; - pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[1] - kStride, + mbptr_->mode_info_context->mbmi.uv_mode = mode; + pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[1] - kStride, data_ptr_[0] - 1, data_ptr_[1] - 1, kStride, data_ptr_[0], data_ptr_[1], kStride); } @@ -340,8 +360,12 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>, // We use 9 lines so we have one line above us for top-prediction. // [0] = U, [1] = V static uint8_t* data_array_; + static MACROBLOCKD* mb_; + static MODE_INFO* mi_; }; +MACROBLOCKD* IntraPredUVTest::mb_ = NULL; +MODE_INFO* IntraPredUVTest::mi_ = NULL; uint8_t* IntraPredUVTest::data_array_ = NULL; TEST_P(IntraPredUVTest, IntraPredTests) { diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index c0416b7e6..b40929381 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -611,16 +611,12 @@ void vp8_sixtap_predict4x4_ssse3 for (r = 0; r < 4; r++) { - #if !(CONFIG_FAST_UNALIGNED) dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2]; dst_ptr[3] = src_ptr[3]; - #else - *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr ; - #endif - dst_ptr += dst_pitch; - src_ptr += src_pixels_per_line; + dst_ptr += dst_pitch; + src_ptr += src_pixels_per_line; } } } diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c index b5a11ae34..091554a5d 100644 --- a/vp8/encoder/dct.c +++ b/vp8/encoder/dct.c @@ 
-20,10 +20,10 @@ void vp8_short_fdct4x4_c(short *input, short *output, int pitch) for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])<<3); - b1 = ((ip[1] + ip[2])<<3); - c1 = ((ip[1] - ip[2])<<3); - d1 = ((ip[0] - ip[3])<<3); + a1 = ((ip[0] + ip[3]) * 8); + b1 = ((ip[1] + ip[2]) * 8); + c1 = ((ip[1] - ip[2]) * 8); + d1 = ((ip[0] - ip[3]) * 8); op[0] = a1 + b1; op[2] = a1 - b1; @@ -72,10 +72,10 @@ void vp8_short_walsh4x4_c(short *input, short *output, int pitch) for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[2])<<2); - d1 = ((ip[1] + ip[3])<<2); - c1 = ((ip[1] - ip[3])<<2); - b1 = ((ip[0] - ip[2])<<2); + a1 = ((ip[0] + ip[2]) * 4); + d1 = ((ip[1] + ip[3]) * 4); + c1 = ((ip[1] - ip[3]) * 4); + b1 = ((ip[0] - ip[2]) * 4); op[0] = a1 + d1 + (a1!=0); op[1] = b1 + c1; diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c index 3e3e400a4..fb7b5cdc4 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -20,26 +20,28 @@ extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, +extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input, int16_t *output, int output_stride); -extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void save_neon_registers(); -extern void restore_neon_registers(); +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vp9_push_neon(int64_t *store); +extern void vp9_pop_neon(int64_t *store); void vp9_short_idct16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. 
- save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } -void vp9_short_idct10_16x16_add_neon(int16_t *input, +void vp9_short_idct16x16_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct10_16x16_add_neon_pass2(input+1, + vp9_short_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } diff --git a/vp9/common/arm/neon/vp9_idct32x32_neon.c b/vp9/common/arm/neon/vp9_idct32x32_neon.c deleted file mode 100644 index ceecd6fbd..000000000 --- a/vp9/common/arm/neon/vp9_idct32x32_neon.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_common.h" - -// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm -extern void idct32_transpose_and_transform(int16_t *transpose_buffer, - int16_t *output, int16_t *input); -extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); - - -// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm -extern void save_neon_registers(); -extern void restore_neon_registers(); - -void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, - int dest_stride) { - // TODO(cd): move the creation of these buffers within the ASM file - // internal buffer used to transpose 8 lines into before transforming them - int16_t transpose_buffer[32 * 8]; - // results of the first pass (transpose and transform rows) - int16_t pass1[32 * 32]; - // results of the second pass (transpose and transform columns) - int16_t pass2[32 * 32]; - - // save register we need to preserve - save_neon_registers(); - // process rows - idct32_transpose_and_transform(transpose_buffer, pass1, input); - // process columns - // TODO(cd): do these two steps/passes within the ASM file - idct32_transpose_and_transform(transpose_buffer, pass2, pass1); - // combine and add to dest - // TODO(cd): integrate this within the last storage step of the second pass - idct32_combine_add(dest, pass2, dest_stride); - // restore register we need to preserve - restore_neon_registers(); -} - -// TODO(cd): Eliminate this file altogether when everything is in ASM file diff --git a/vp9/common/arm/neon/vp9_save_reg_neon.asm b/vp9/common/arm/neon/vp9_save_reg_neon.asm new file mode 100644 index 000000000..71c3e7077 --- /dev/null +++ b/vp9/common/arm/neon/vp9_save_reg_neon.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_push_neon| + EXPORT |vp9_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp9_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm index 7464e800f..df2a0526c 100644 --- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -10,10 +10,8 @@ EXPORT |vp9_short_idct16x16_add_neon_pass1| EXPORT |vp9_short_idct16x16_add_neon_pass2| - EXPORT |vp9_short_idct10_16x16_add_neon_pass1| - EXPORT |vp9_short_idct10_16x16_add_neon_pass2| - EXPORT |save_neon_registers| - EXPORT |restore_neon_registers| + EXPORT |vp9_short_idct16x16_10_add_neon_pass1| + EXPORT |vp9_short_idct16x16_10_add_neon_pass2| ARM REQUIRE8 PRESERVE8 @@ -788,7 +786,7 @@ end_idct16x16_pass2 bx lr ENDP ; |vp9_short_idct16x16_add_neon_pass2| -;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -798,7 +796,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. 
-|vp9_short_idct10_16x16_add_neon_pass1| PROC +|vp9_short_idct16x16_10_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -907,9 +905,9 @@ end_idct16x16_pass2 vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + ENDP ; |vp9_short_idct16x16_10_add_neon_pass1| -;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -926,7 +924,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass2| PROC +|vp9_short_idct16x16_10_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -1177,15 +1175,5 @@ end_idct16x16_pass2 end_idct10_16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| -;void |save_neon_registers|() -|save_neon_registers| PROC - vpush {d8-d15} - bx lr - ENDP ; |save_registers| -;void |restore_neon_registers|() -|restore_neon_registers| PROC - vpop {d8-d15} - bx lr - ENDP ; |restore_registers| + ENDP ; |vp9_short_idct16x16_10_add_neon_pass2| END diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm index 3a0ff608b..b5a284b5a 100644 --- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -43,8 +43,7 @@ cospi_30_64 EQU 1606 cospi_31_64 EQU 804 - EXPORT |idct32_transpose_and_transform| - EXPORT |idct32_combine_add| + EXPORT |vp9_short_idct32x32_add_neon| ARM REQUIRE8 PRESERVE8 @@ -100,6 +99,142 @@ cospi_31_64 EQU 804 vst1.16 {$reg2}, [r1] MEND ; -------------------------------------------------------------------------- + ; 
Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! 
+ ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! 
+ ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- ; Touches q8-q12, q15 (q13-q14 are preserved) ; valid output registers are anything but q8-q11 MACRO @@ -110,12 +245,12 @@ cospi_31_64 EQU 804 ; additions/substractions before the multiplies. ; generate the constants ; generate scalar constants - mov r3, #$first_constant & 0xFF00 - add r3, #$first_constant & 0x00FF + mov r8, #$first_constant & 0xFF00 mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF add r12, #$second_constant & 0x00FF ; generate vector constants - vdup.16 d30, r3 + vdup.16 d30, r8 vdup.16 d31, r12 ; (used) two for inputs (regA-regD), one for constants (q15) ; do some multiplications (ordered for maximum latency hiding) @@ -153,15 +288,22 @@ cospi_31_64 EQU 804 MEND ; -------------------------------------------------------------------------- -;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input); +;void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, int dest_stride); ; -; r0 int16_t *transpose_buffer -; r1 int16_t *output -; r2 int16_t *input) -; TODO(cd): have more logical parameter ordering but this issue will disappear -; when functions are combined. +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) -|idct32_transpose_and_transform| PROC +|vp9_short_idct32x32_add_neon| PROC ; This function does one pass of idct32x32 transform. 
; ; This is done by transposing the input and then doing a 1d transform on @@ -171,43 +313,73 @@ cospi_31_64 EQU 804 ; The 1d transform is done by looping over bands of eight columns (the ; idct32_bands loop). For each band, the transform input transposition ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are trsnposed by pairs (the idct32_transpose_pair loop). - push {r4} - mov r4, #0 ; initialize bands loop counter + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter idct32_bands_loop - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - push {r0} - push {r1} - push {r2} - mov r3, #0 ; initialize transpose loop counter + mov r8, #2 ; initialize transpose loop counter idct32_transpose_pair_loop ; Load two horizontally consecutive 8x8 16bit data matrices. The first one ; into q0-q7 and the second one into q8-q15. 
There is a stride of 64, ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r2]! - vld1.s16 {q0}, [r2]! - add r2, #32 - vld1.s16 {q9}, [r2]! - vld1.s16 {q1}, [r2]! - add r2, #32 - vld1.s16 {q10}, [r2]! - vld1.s16 {q2}, [r2]! - add r2, #32 - vld1.s16 {q11}, [r2]! - vld1.s16 {q3}, [r2]! - add r2, #32 - vld1.s16 {q12}, [r2]! - vld1.s16 {q4}, [r2]! - add r2, #32 - vld1.s16 {q13}, [r2]! - vld1.s16 {q5}, [r2]! - add r2, #32 - vld1.s16 {q14}, [r2]! - vld1.s16 {q6}, [r2]! - add r2, #32 - vld1.s16 {q15}, [r2]! - vld1.s16 {q7}, [r2]! + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! ; Transpose the two 8x8 16bit data matrices. vswp d17, d24 @@ -255,11 +427,13 @@ idct32_transpose_pair_loop vst1.16 {q7}, [r0]! 
; increment pointers by adjusted stride (not necessary for r0/out) - sub r2, r2, #8*32*2-32-16*2 + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 ; transpose pair loop processing - add r3, r3, #1 - cmp r3, #1 - ble idct32_transpose_pair_loop + subs r8, r8, #1 + bne idct32_transpose_pair_loop ; restore r0/input to its original value sub r0, r0, #32*8*2 @@ -815,21 +989,26 @@ idct32_transpose_pair_loop vadd.s16 q9, q5, q0 vsub.s16 q6, q5, q0 vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 17, 17, 16, q7, q6 - STORE_IN_OUTPUT 16, 15, 14, q9, q8 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; ;output[30 * 32] = step1b[1][i] - step1b[30][i]; ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 14, 30, 31, q0, q1 + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 31, 30, q7, q6 - STORE_IN_OUTPUT 30, 0, 1, q4, q5 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[2] = step1b[2][i] + step1b[13][i]; @@ -848,25 +1027,25 @@ idct32_transpose_pair_loop ;output[18 * 32] = step1b[13][i] - step1b[18][i]; ;output[19 * 32] = step1b[12][i] - step1b[19][i]; LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 19, 19, 18, q9, q8 - STORE_IN_OUTPUT 18, 13, 12, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + 
STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; ;output[28 * 32] = step1b[3][i] - step1b[28][i]; ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 12, 28, 29, q0, q1 + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 29, 28, q7, q6 - STORE_IN_OUTPUT 28, 2, 3, q4, q5 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[4] = step1b[4][i] + step1b[11][i]; @@ -885,25 +1064,25 @@ idct32_transpose_pair_loop ;output[20 * 32] = step1b[11][i] - step1b[20][i]; ;output[21 * 32] = step1b[10][i] - step1b[21][i]; LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 21, 21, 20, q9, q8 - STORE_IN_OUTPUT 20, 11, 10, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; ;output[26 * 32] = step1b[5][i] - step1b[26][i]; ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 10, 26, 27, q0, q1 + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 27, 26, q7, q6 - STORE_IN_OUTPUT 26, 4, 5, q4, q5 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 ; -------------------------------------------------------------------------- ; part of 
stage 7 ;step1[6] = step1b[6][i] + step1b[9][i]; @@ -922,92 +1101,199 @@ idct32_transpose_pair_loop ;output[22 * 32] = step1b[9][i] - step1b[22][i]; ;output[23 * 32] = step1b[8][i] - step1b[23][i]; LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 23, 23, 22, q9, q8 - STORE_IN_OUTPUT 22, 9, 8, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; ;output[24 * 32] = step1b[7][i] - step1b[24][i]; ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 8, 24, 25, q0, q1 + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 25, 24, q7, q6 - STORE_IN_OUTPUT 24, 6, 7, q4, q5 - ; -------------------------------------------------------------------------- + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - pop {r2} - pop {r1} - pop {r0} - add r1, r1, #8*2 - add r2, r2, #8*32*2 + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 ; bands loop processing - add r4, r4, #1 - cmp r4, #3 - ble idct32_bands_loop + subs r4, r4, #1 + bne idct32_bands_loop - pop {r4} - bx lr - ENDP ; |idct32_transpose_and_transform| + ; 
parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 -;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); -; -; r0 uint8_t *dest -; r1 int16_t *out -; r2 int dest_stride) - -|idct32_combine_add| PROC - - mov r12, r0 ; dest pointer used for stores - sub r2, r2, #32 ; adjust the stride (remove the post-increments) - mov r3, #0 ; initialize loop counter - -idct32_combine_add_loop - ; load out[j * 32 + 0-31] - vld1.s16 {q12}, [r1]! - vld1.s16 {q13}, [r1]! - vld1.s16 {q14}, [r1]! - vld1.s16 {q15}, [r1]! - ; load dest[j * dest_stride + 0-31] - vld1.s16 {q6}, [r0]! - vld1.s16 {q7}, [r0]! - ; ROUND_POWER_OF_TWO - vrshr.s16 q12, q12, #6 - vrshr.s16 q13, q13, #6 - vrshr.s16 q14, q14, #6 - vrshr.s16 q15, q15, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q12, q12, d12 - vaddw.u8 q13, q13, d13 - vaddw.u8 q14, q14, d14 - vaddw.u8 q15, q15, d15 - ; clip pixel - vqmovun.s16 d12, q12 - vqmovun.s16 d13, q13 - vqmovun.s16 d14, q14 - vqmovun.s16 d15, q15 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {q6}, [r12]! - vst1.16 {q7}, [r12]! 
- ; increment pointers by adjusted stride (not necessary for r1/out) - add r0, r0, r2 - add r12, r12, r2 - ; loop processing - add r3, r3, #1 - cmp r3, #31 - ble idct32_combine_add_loop + ; pass loop processing + add r5, r5, #1 + B idct32_pass_loop - bx lr - ENDP ; |idct32_transpose| +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + 
LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + 
;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. 
+ add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |vp9_short_idct32x32_add_neon| END diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index a744f59db..c02251a3d 100644 --- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_short_idct8x8_add_neon| - EXPORT |vp9_short_idct10_8x8_add_neon| + EXPORT |vp9_short_idct8x8_10_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_short_idct8x8_add_neon| -;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct10_8x8_add_neon| PROC +|vp9_short_idct8x8_10_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! 
@@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct10_8x8_add_neon| + ENDP ; |vp9_short_idct8x8_10_add_neon| END diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 864e27e98..5e526a83c 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -170,13 +170,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { void vp9_create_common(VP9_COMMON *cm) { vp9_machine_specific_config(cm); - vp9_init_mbmode_probs(cm); - cm->tx_mode = ONLY_4X4; cm->comp_pred_mode = HYBRID_PREDICTION; - - // Initialize reference frame sign bias structure to defaults - vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); } void vp9_remove_common(VP9_COMMON *cm) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index c8d677fb9..9ab2cc31b 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -137,7 +137,7 @@ typedef struct { TX_SIZE tx_size; int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv, best_second_mv; + int_mv best_mv[2]; uint8_t mode_context[MAX_REF_FRAMES]; @@ -247,7 +247,7 @@ typedef struct macroblockd { } MACROBLOCKD; -static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { +static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: case BLOCK_64X32: diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index abedf6b27..1705402c2 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -282,7 +282,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, int r; for (r = h; r > 0; --r) { - memcpy(dst, src, w); + vpx_memcpy(dst, src, w); src += src_stride; dst += dst_stride; } diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 32d9e0cf7..f171c317f 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -569,31 +569,6 @@ 
void vp9_init_neighbors() { vp9_default_scan_32x32_neighbors); } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { - if (scan == vp9_default_scan_4x4) { - return vp9_default_scan_4x4_neighbors; - } else if (scan == vp9_row_scan_4x4) { - return vp9_row_scan_4x4_neighbors; - } else if (scan == vp9_col_scan_4x4) { - return vp9_col_scan_4x4_neighbors; - } else if (scan == vp9_default_scan_8x8) { - return vp9_default_scan_8x8_neighbors; - } else if (scan == vp9_row_scan_8x8) { - return vp9_row_scan_8x8_neighbors; - } else if (scan == vp9_col_scan_8x8) { - return vp9_col_scan_8x8_neighbors; - } else if (scan == vp9_default_scan_16x16) { - return vp9_default_scan_16x16_neighbors; - } else if (scan == vp9_row_scan_16x16) { - return vp9_row_scan_16x16_neighbors; - } else if (scan == vp9_col_scan_16x16) { - return vp9_col_scan_16x16_neighbors; - } else { - assert(scan == vp9_default_scan_32x32); - return vp9_default_scan_32x32_neighbors; - } -} - void vp9_coef_tree_initialize() { vp9_init_neighbors(); init_bit_trees(); diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index c1f2d782b..4ed94815b 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -190,9 +190,6 @@ static INLINE int get_coef_context(const int16_t *neighbors, token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); - - // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly @@ -367,22 +364,24 @@ static int get_entropy_context(TX_SIZE tx_size, static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size, PLANE_TYPE type, int block_idx, const int16_t **scan, + const int16_t **scan_nb, const uint8_t **band_translate) { switch (tx_size) { case TX_4X4: - *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); + get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); 
*band_translate = vp9_coefband_trans_4x4; break; case TX_8X8: - *scan = get_scan_8x8(get_tx_type_8x8(type, xd)); + get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); *band_translate = vp9_coefband_trans_8x8plus; break; case TX_16X16: - *scan = get_scan_16x16(get_tx_type_16x16(type, xd)); + get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); *band_translate = vp9_coefband_trans_8x8plus; break; case TX_32X32: *scan = vp9_default_scan_32x32; + *scan_nb = vp9_default_scan_32x32_neighbors; *band_translate = vp9_coefband_trans_8x8plus; break; default: diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 4cf4c0392..31537c7f7 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -14,7 +14,6 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_treecoder.h" -#define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 #define MODE_UPDATE_PROB 252 #define SWITCHABLE_FILTERS 3 // number of switchable filters diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 49a731fdb..73f6b4c19 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -54,7 +54,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, dst_list[1].as_int = 0; if (block_idx == 0) { - memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); + vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); } else if (block_idx == 1 || block_idx == 2) { int dst = 0, n; union b_mode_info *bmi = mi->bmi; diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index ad0d882b9..50dfdc7fb 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -55,13 +55,11 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, if (!mi) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&mi->mbmi)) return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 1 + b)->as_mode); - } else 
{ - return mi->mbmi.mode; - } + else + return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 1 + b)->as_mode + : mi->mbmi.mode; } assert(b == 1 || b == 3); return (mi->bmi + b - 1)->as_mode; @@ -77,13 +75,11 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, if (!mi) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&mi->mbmi)) return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 2 + b)->as_mode); - } else { - return mi->mbmi.mode; - } + else + return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode + : mi->mbmi.mode; } return (mi->bmi + b - 2)->as_mode; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index a2245259e..bc30d2a95 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -28,10 +28,10 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t *op = output; for (i = 0; i < 4; i++) { - a1 = ip[0] >> WHT_UPSCALE_FACTOR; - c1 = ip[1] >> WHT_UPSCALE_FACTOR; - d1 = ip[2] >> WHT_UPSCALE_FACTOR; - b1 = ip[3] >> WHT_UPSCALE_FACTOR; + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1; @@ -77,7 +77,7 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { int16_t *ip = in; int16_t *op = tmp; - a1 = ip[0] >> WHT_UPSCALE_FACTOR; + a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; op[0] = a1; @@ -420,7 +420,7 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, +void vp9_short_idct8x8_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; @@ -838,7 +838,7 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void 
vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, +void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 5f2f0a569..59892cd03 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -22,7 +22,8 @@ #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) -#define WHT_UPSCALE_FACTOR 2 +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 0f2e4e999..18407dd73 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -132,7 +132,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, const int x = 4 * (block & ((1 << bwl) - 1)); const int y = 4 * (block >> bwl); const MODE_INFO *mi = xd->this_mi; - const int use_second_ref = mi->mbmi.ref_frame[1] > 0; + const int is_compound = has_second_ref(&mi->mbmi); int ref; assert(x < bw); @@ -140,7 +140,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw); assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh); - for (ref = 0; ref < 1 + use_second_ref; ++ref) { + for (ref = 0; ref < 1 + is_compound; ++ref) { struct scale_factors *const scale = &xd->scale_factor[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 042afbbef..42923b3c8 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -53,7 +53,7 @@ prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d45_predictor_4x4 $ssse3_x86inc prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const 
uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_4x4 +specialize vp9_d63_predictor_4x4 $ssse3_x86inc prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_4x4 $ssse3_x86inc @@ -92,7 +92,7 @@ prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d45_predictor_8x8 $ssse3_x86inc prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_8x8 +specialize vp9_d63_predictor_8x8 $ssse3_x86inc prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_8x8 $ssse3_x86inc @@ -131,7 +131,7 @@ prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d45_predictor_16x16 $ssse3_x86inc prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_16x16 +specialize vp9_d63_predictor_16x16 $ssse3_x86inc prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_16x16 $ssse3_x86inc @@ -170,7 +170,7 @@ prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d45_predictor_32x32 $ssse3_x86inc prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_32x32 +specialize vp9_d63_predictor_32x32 $ssse3_x86inc prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_32x32 $ssse3_x86inc @@ -306,8 +306,8 @@ specialize vp9_short_idct8x8_1_add sse2 neon prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon
-prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_8x8_add sse2 neon +prototype void vp9_short_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_10_add sse2 neon prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_1_add sse2 neon @@ -315,8 +315,8 @@ specialize vp9_short_idct16x16_1_add sse2 neon prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 neon -prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_16x16_add sse2 neon +prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_10_add sse2 neon prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 neon diff --git a/vp9/common/vp9_subpelvar.h b/vp9/common/vp9_subpelvar.h deleted file mode 100644 index fe75481f6..000000000 --- a/vp9/common/vp9_subpelvar.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_SUBPELVAR_H_ -#define VP9_COMMON_VP9_SUBPELVAR_H_ - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" - -static void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { - int i, j; - int diff; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. 
- * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. 
- * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#endif // VP9_COMMON_VP9_SUBPELVAR_H_ diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 8f740f412..d44c7e2a0 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -985,7 +985,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_short_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -2456,7 +2456,7 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, write_buffer_8x16(dest, in1, stride); } -void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, +void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm index 67c8ab03a..c51d01151 100644 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -17,8 
+17,8 @@ pw_2: times 8 dw 2 pb_7m1: times 8 db 7, -1 pb_15: times 16 db 15 -sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7 +sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 @@ -305,3 +305,153 @@ cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset RESTORE_GOT RET + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + pshufb m3, [GLOBAL(sh_b0123456777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 
+ movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + lea dstq, [dstq+strideq*4] + psrldq m3, 1 + psrldq m4, 1 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m0, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 + pavgb m0, m3 + + mov lined, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m4 + pshufb m0, m1 + pshufb m4, m1 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m4 + pshufb m0, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m7, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, line + mova m1, [GLOBAL(sh_b123456789abcdeff)] + lea stride3q, [strideq*3] + pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m7, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 + palignr m6, m7, m0, 1 + palignr m5, m7, m0, 2 + pavgb m7, m3 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 + pavgb m0, m6 + + mov lined, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m7 + mova [dstq+strideq ], m2 + mova [dstq+strideq +16], m4 + palignr m3, m7, m0, 1 + palignr m5, m4, m2, 1 + pshufb m7, m1 + pshufb m4, m1 + + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m5 + mova [dstq+stride3q +16], m4 + palignr m0, m7, m3, 1 + palignr m2, m4, m5, 1 + pshufb m7, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 
84a29b17a..ba9fad25e 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -75,28 +75,9 @@ static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, return TX_4X4; } -static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, - int mi_row, int mi_col, int segment_id) { - const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); - const int xmis = MIN(cm->mi_cols - mi_col, bw); - const int ymis = MIN(cm->mi_rows - mi_row, bh); - int x, y; - - assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); - - for (y = 0; y < ymis; y++) - for (x = 0; x < xmis; x++) - cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; -} - static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r) { - MACROBLOCKD *const xd = &pbi->mb; struct segmentation *const seg = &pbi->common.seg; - const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; - int segment_id; if (!seg->enabled) return 0; // Default for disabled segmentation @@ -104,9 +85,7 @@ static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, if (!seg->update_map) return 0; - segment_id = read_segment_id(r, seg); - set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id); - return segment_id; + return read_segment_id(r, seg); } static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, @@ -115,7 +94,7 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, MACROBLOCKD *const xd = &pbi->mb; struct segmentation *const seg = &cm->seg; const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; - int pred_segment_id, segment_id; + int pred_segment_id; if (!seg->enabled) return 0; // Default for disabled segmentation @@ -129,13 +108,10 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); const int pred_flag = vp9_read(r, pred_prob);
vp9_set_pred_flag_seg_id(xd, pred_flag); - segment_id = pred_flag ? pred_segment_id - : read_segment_id(r, seg); + return pred_flag ? pred_segment_id : read_segment_id(r, seg); } else { - segment_id = read_segment_id(r, seg); + return read_segment_id(r, seg); } - set_segment_id(cm, bsize, mi_row, mi_col, segment_id); - return segment_id; } static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { @@ -200,7 +176,6 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, static int read_mv_component(vp9_reader *r, const nmv_component *mvcomp, int usehp) { - int mag, d, fr, hp; const int sign = vp9_read(r, mvcomp->sign); const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); @@ -493,11 +468,12 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; + int b_mode; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv blockmv, secondmv; const int j = idy * 2 + idx; - const int b_mode = read_inter_mode(cm, r, inter_mode_ctx); + b_mode = read_inter_mode(cm, r, inter_mode_ctx); if (b_mode == NEARESTMV || b_mode == NEARMV) { vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0, @@ -544,10 +520,10 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, mi->bmi[j + 2] = mi->bmi[j]; if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j]; - mi->mbmi.mode = b_mode; } } + mi->mbmi.mode = b_mode; mv0->as_int = mi->bmi[3].as_mv[0].as_int; mv1->as_int = mi->bmi[3].as_mv[1].as_int; } else { diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 34ed0c759..77fec5061 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -436,7 +436,6 @@ static void setup_segmentation(struct segmentation *seg, static void setup_loopfilter(struct loopfilter *lf, struct 
vp9_read_bit_buffer *rb) { - lf->filter_level = vp9_rb_read_literal(rb, 6); lf->sharpness_level = vp9_rb_read_literal(rb, 3); @@ -935,6 +934,15 @@ void vp9_init_dequantizer(VP9_COMMON *cm) { } } +static void update_segmentation_map(VP9_COMMON *cm) { + int i, j; + + for (i = 0; i < cm->mi_rows; ++i) + for (j = 0; j < cm->mi_cols; ++j) + cm->last_frame_seg_map[i * cm->mi_cols + j] = + cm->mi_grid_visible[i * cm->mode_info_stride + j]->mbmi.segment_id; +} + int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; VP9_COMMON *const cm = &pbi->common; @@ -1014,5 +1022,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = cm->fc; + update_segmentation_map(cm); + return 0; } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 3792b9c78..8fcf83ee3 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -106,8 +106,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, const uint8_t *band_translate; uint8_t token_cache[1024]; int pt = get_entropy_context(tx_size, A, L); - get_scan_and_band(xd, tx_size, type, block_idx, &scan, &band_translate); - nb = vp9_get_coef_neighbors_handle(scan); + get_scan_and_band(xd, tx_size, type, block_idx, &scan, &nb, &band_translate); while (1) { int val; @@ -122,7 +121,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; -SKIP_START: + SKIP_START: if (c >= seg_eob) break; if (c) diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c index 8cc64f73e..9a970d42b 100644 --- a/vp9/decoder/vp9_dsubexp.c +++ b/vp9/decoder/vp9_dsubexp.c @@ -67,7 +67,6 @@ static int inv_remap_prob(int v, int m) { 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 
251, 252, - }; // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM); v = inv_map_table[v]; diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 395e636b8..cac5f1a76 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/decoder/vp9_idct_blk.h" @@ -96,7 +96,7 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_short_idct8x8_1_add(input, dest, stride); input[0] = 0; } else if (eob <= 10) { - vp9_short_idct10_8x8_add(input, dest, stride); + vp9_short_idct8x8_10_add(input, dest, stride); vpx_memset(input, 0, 128); } else { vp9_short_idct8x8_add(input, dest, stride); @@ -126,7 +126,7 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_short_idct16x16_1_add(input, dest, stride); input[0] = 0; } else if (eob <= 10) { - vp9_short_idct10_16x16_add(input, dest, stride); + vp9_short_idct16x16_10_add(input, dest, stride); vpx_memset(input, 0, 512); } else { vp9_short_idct16x16_add(input, dest, stride); diff --git a/vp9/decoder/vp9_idct_blk.h b/vp9/decoder/vp9_idct_blk.h index 1810bd02f..00f1bc6a6 100644 --- a/vp9/decoder/vp9_idct_blk.h +++ b/vp9/decoder/vp9_idct_blk.h @@ -14,17 +14,16 @@ #include "vp9/common/vp9_blockd.h" +void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, + int stride, int eob); -void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride, - int eob); - -void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, +void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, +void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned 
char *dest, +void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob); #endif // VP9_DECODER_VP9_IDCT_BLK_H_ diff --git a/vp9/decoder/vp9_onyxd.h b/vp9/decoder/vp9_onyxd.h index cd5b7508f..4f662e9ac 100644 --- a/vp9/decoder/vp9_onyxd.h +++ b/vp9/decoder/vp9_onyxd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ONYXD_H_ -#define VP9_COMMON_VP9_ONYXD_H_ +#ifndef VP9_DECODER_VP9_ONYXD_H_ +#define VP9_DECODER_VP9_ONYXD_H_ #ifdef __cplusplus extern "C" { @@ -66,4 +66,4 @@ void vp9_remove_decompressor(VP9D_PTR comp); } #endif -#endif // VP9_COMMON_VP9_ONYXD_H_ +#endif // VP9_DECODER_VP9_ONYXD_H_ diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index 17d5def33..1c804d974 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -65,13 +65,12 @@ static void recon_write_yuv_frame(const char *name, #endif #if WRITE_RECON_BUFFER == 2 void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - // write the frame FILE *yframe; int i; char filename[255]; - sprintf(filename, "dx\\y%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->y_height; i++) @@ -79,7 +78,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->y_width, 1, yframe); fclose(yframe); - sprintf(filename, "dx\\u%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -87,7 +86,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->uv_width, 1, yframe); fclose(yframe); - sprintf(filename, "dx\\v%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -214,13 +213,13 @@ vpx_codec_err_t 
vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, * vpxenc --test-decode functionality working, and will be replaced in a * later commit that adds VP9-specific controls for this functionality. */ - if (ref_frame_flag == VP9_LAST_FLAG) + if (ref_frame_flag == VP9_LAST_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[0]; - else if (ref_frame_flag == VP9_GOLD_FLAG) + } else if (ref_frame_flag == VP9_GOLD_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[1]; - else if (ref_frame_flag == VP9_ALT_FLAG) + } else if (ref_frame_flag == VP9_ALT_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[2]; - else { + } else { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Invalid reference frame"); return pbi->common.error.error_code; diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index a051971a1..8fee5e975 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -41,4 +41,4 @@ typedef struct VP9Decompressor { VP9Worker lf_worker; } VP9D_COMP; -#endif // VP9_DECODER_VP9_TREEREADER_H_ +#endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h index c7fa3aa27..41a686837 100644 --- a/vp9/decoder/vp9_read_bit_buffer.h +++ b/vp9/decoder/vp9_read_bit_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_READ_BIT_BUFFER_ -#define VP9_READ_BIT_BUFFER_ +#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_ +#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_ #include <limits.h> @@ -57,4 +57,4 @@ static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, return vp9_rb_read_bit(rb) ? 
-value : value; } -#endif // VP9_READ_BIT_BUFFER_ +#endif // VP9_DECODER_VP9_READ_BIT_BUFFER_H_ diff --git a/vp9/decoder/vp9_thread.h b/vp9/decoder/vp9_thread.h index a8f7e046a..0b5eca073 100644 --- a/vp9/decoder/vp9_thread.h +++ b/vp9/decoder/vp9_thread.h @@ -17,7 +17,7 @@ #ifndef VP9_DECODER_VP9_THREAD_H_ #define VP9_DECODER_VP9_THREAD_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { @@ -90,4 +90,4 @@ void vp9_worker_end(VP9Worker* const worker); } // extern "C" #endif -#endif /* VP9_DECODER_VP9_THREAD_H_ */ +#endif // VP9_DECODER_VP9_THREAD_H_ diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 622f75fe6..20dd8e175 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -484,17 +484,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } if (bsize < BLOCK_8X8) { - int j; - MB_PREDICTION_MODE blockmode; - int_mv blockmv; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - j = idy * 2 + idx; - blockmode = x->partition_info->bmi[j].mode; - blockmv = m->bmi[j].as_mv[0]; + const int j = idy * 2 + idx; + const MB_PREDICTION_MODE blockmode = x->partition_info->bmi[j].mode; write_sb_mv_ref(bc, blockmode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(blockmode)]; @@ -503,14 +499,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { #ifdef ENTROPY_STATS active_section = 11; #endif - vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv, - nmvc, allow_hp); - - if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, - &m->bmi[j].as_mv[1].as_mv, - &mi->best_second_mv.as_mv, - nmvc, allow_hp); + vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, + 
&mi->best_mv[0].as_mv, nmvc, allow_hp); + + if (has_second_ref(mi)) + vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, + &mi->best_mv[1].as_mv, nmvc, allow_hp); } } } @@ -518,12 +512,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { #ifdef ENTROPY_STATS active_section = 5; #endif - vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv, - nmvc, allow_hp); + vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, + &mi->best_mv[0].as_mv, nmvc, allow_hp); - if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, - nmvc, allow_hp); + if (has_second_ref(mi)) + vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, + &mi->best_mv[1].as_mv, nmvc, allow_hp); } } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 013047e35..5a0d746c8 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,6 +34,7 @@ typedef struct { typedef struct { MODE_INFO mic; PARTITION_INFO partition_info; + unsigned char zcoeff_blk[256]; int skip; int_mv best_ref_mv; int_mv second_best_ref_mv; @@ -136,6 +137,7 @@ struct macroblock { int mv_row_min; int mv_row_max; + unsigned char zcoeff_blk[TX_SIZES][256]; int skip; int encode_breakout; diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index ca863931e..b9c300033 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -637,10 +637,10 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { c1 = e1 - c1; a1 -= c1; d1 += b1; - op[0] = a1 << WHT_UPSCALE_FACTOR; - op[1] = c1 << WHT_UPSCALE_FACTOR; - op[2] = d1 << WHT_UPSCALE_FACTOR; - op[3] = b1 << WHT_UPSCALE_FACTOR; + op[0] = a1 * UNIT_QUANT_FACTOR; + op[1] = c1 * UNIT_QUANT_FACTOR; + op[2] = d1 * UNIT_QUANT_FACTOR; + op[3] = b1 * UNIT_QUANT_FACTOR; ip += 4; op += 4; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index ee938bda9..f6045e80b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -390,6 +390,9 @@ static 
void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk, + sizeof(ctx->zcoeff_blk)); + if (!output_enabled) return; @@ -428,19 +431,19 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->mode_chosen_counts[mb_mode_index]++; if (is_inter_block(mbmi) && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv, best_second_mv; + int_mv best_mv[2]; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; - best_mv.as_int = ctx->best_ref_mv.as_int; - best_second_mv.as_int = ctx->second_best_ref_mv.as_int; + best_mv[0].as_int = ctx->best_ref_mv.as_int; + best_mv[1].as_int = ctx->second_best_ref_mv.as_int; if (mbmi->mode == NEWMV) { - best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int; + best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int; if (rf2 > 0) - best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int; + best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int; } - mbmi->best_mv.as_int = best_mv.as_int; - mbmi->best_second_mv.as_int = best_second_mv.as_int; - vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv); + mbmi->best_mv[0].as_int = best_mv[0].as_int; + mbmi->best_mv[1].as_int = best_mv[1].as_int; + vp9_update_mv_count(cpi, x, best_mv); } if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) { @@ -2209,7 +2212,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->inter_zz_count = 0; vp9_zero(cm->counts.switchable_interp); - vp9_zero(cpi->txfm_stepdown_count); + vp9_zero(cpi->tx_stepdown_count); xd->mi_8x8 = cm->mi_grid_visible; // required for vp9_frame_init_quantizer @@ -2348,18 +2351,19 @@ static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, int mis, TX_SIZE max_tx_size, int bw, int bh, int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + if 
(mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) { return; - - if (mbmi->tx_size > max_tx_size) { - const int ymbs = MIN(bh, cm->mi_rows - mi_row); - const int xmbs = MIN(bw, cm->mi_cols - mi_col); - - assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi_8x8, mis, ymbs, xmbs)); - set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + } else { + MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; + if (mbmi->tx_size > max_tx_size) { + const int ymbs = MIN(bh, cm->mi_rows - mi_row); + const int xmbs = MIN(bw, cm->mi_cols - mi_col); + + assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + get_skip_flag(mi_8x8, mis, ymbs, xmbs)); + set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + } } } @@ -2454,9 +2458,9 @@ static void select_tx_mode(VP9_COMP *cpi) { unsigned int total = 0; int i; for (i = 0; i < TX_SIZES; ++i) - total += cpi->txfm_stepdown_count[i]; + total += cpi->tx_stepdown_count[i]; if (total) { - double fraction = (double)cpi->txfm_stepdown_count[0] / total; + double fraction = (double)cpi->tx_stepdown_count[0] / total; cpi->common.tx_mode = fraction > 0.90 ? 
ALLOW_32X32 : TX_MODE_SELECT; // printf("fraction = %f\n", fraction); } // else keep unchanged @@ -2732,7 +2736,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])]; YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; YV12_BUFFER_CONFIG *second_ref_fb = NULL; - if (mbmi->ref_frame[1] > 0) { + if (has_second_ref(mbmi)) { idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])]; second_ref_fb = &cm->yv12_fb[idx]; } @@ -2744,7 +2748,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, &xd->scale_factor[1]); - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2c12477a7..76a5d33e7 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -53,7 +53,7 @@ static void inverse_transform_b_8x8_add(int eob, if (eob <= 1) vp9_short_idct8x8_1_add(dqcoeff, dest, stride); else if (eob <= 10) - vp9_short_idct10_8x8_add(dqcoeff, dest, stride); + vp9_short_idct8x8_10_add(dqcoeff, dest, stride); else vp9_short_idct8x8_add(dqcoeff, dest, stride); } @@ -64,7 +64,7 @@ static void inverse_transform_b_16x16_add(int eob, if (eob <= 1) vp9_short_idct16x16_1_add(dqcoeff, dest, stride); else if (eob <= 10) - vp9_short_idct10_16x16_add(dqcoeff, dest, stride); + vp9_short_idct16x16_10_add(dqcoeff, dest, stride); else vp9_short_idct16x16_add(dqcoeff, dest, stride); } @@ -172,7 +172,7 @@ static void optimize_b(MACROBLOCK *mb, assert((!type && !plane) || (type && plane)); dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - get_scan_and_band(xd, tx_size, type, ib, &scan, &band_translate); + get_scan_and_band(xd, tx_size, type, ib, &scan, &nb, &band_translate); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative 
roundings. */ @@ -191,7 +191,6 @@ static void optimize_b(MACROBLOCK *mb, for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ qcoeff_ptr[scan[i]]].token]; - nb = vp9_get_coef_neighbors_handle(scan); for (i = eob; i-- > i0;) { int base_bits, d2, dx; @@ -365,36 +364,10 @@ static void optimize_init_b(int plane, BLOCK_SIZE bsize, const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size; - int i; - switch (tx_size) { - case TX_4X4: - vpx_memcpy(args->ctx->ta[plane], pd->above_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_w); - vpx_memcpy(args->ctx->tl[plane], pd->left_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 2) - args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 4) - args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 8) - args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i]; - break; - default: - assert(0); - } + vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], + pd->above_context, pd->left_context, + num_4x4_w, num_4x4_h); } void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, @@ -482,6 +455,14 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); + + // 
TODO(jingning): per transformed block zero forcing only enabled for + // luma component. will integrate chroma components as well. + if (x->zcoeff_blk[tx_size][block] && plane == 0) { + pd->eobs[block] = 0; + return; + } + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index ed3a2bb64..db08ee856 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -314,44 +314,34 @@ void vp9_build_nmv_cost_table(int *mvjoint, build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp); } -void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, int_mv *second_best_ref_mv) { +static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound, + nmv_context_counts *counts) { + int i; + for (i = 0; i < 1 + is_compound; ++i) { + const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row, + mv[i].as_mv.col - ref[i].as_mv.col }; + vp9_inc_mv(&diff, counts); + } +} + +void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) { MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; - MV diff; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; - int idx, idy; + const int is_compound = has_second_ref(mbmi); if (mbmi->sb_type < BLOCK_8X8) { - PARTITION_INFO *pi = x->partition_info; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { const int i = idy * 2 + idx; - if (pi->bmi[i].mode == NEWMV) { - diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row; - diff.col = mi->bmi[i].as_mv[0].as_mv.col - 
best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - - if (mi->mbmi.ref_frame[1] > INTRA_FRAME) { - diff.row = mi->bmi[i].as_mv[1].as_mv.row - - second_best_ref_mv->as_mv.row; - diff.col = mi->bmi[i].as_mv[1].as_mv.col - - second_best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - } - } + if (x->partition_info->bmi[i].mode == NEWMV) + inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount); } } } else if (mbmi->mode == NEWMV) { - diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row; - diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - - if (mbmi->ref_frame[1] > INTRA_FRAME) { - diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row; - diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - } + inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount); } } diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h index 2789ce114..633177885 100644 --- a/vp9/encoder/vp9_encodemv.h +++ b/vp9/encoder/vp9_encodemv.h @@ -25,7 +25,7 @@ void vp9_build_nmv_cost_table(int *mvjoint, int usehp, int mvc_flag_v, int mvc_flag_h); -void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, int_mv *second_best_ref_mv); + +void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]); #endif // VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 6e44e604c..eaa3bd183 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -534,10 +534,11 @@ void vp9_first_pass(VP9_COMP *cpi) { recon_yoffset = (mb_row * recon_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * 8); - // Set up limit values for motion vectors to prevent them extending outside the UMV borders - x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8)); + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders 
+ x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) - + (VP9BORDERINPIXELS - 8); + + BORDER_MV_PIXELS_B16; // for each macroblock col in image for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { @@ -583,9 +584,9 @@ void vp9_first_pass(VP9_COMP *cpi) { intra_error += (int64_t)this_error; // Set up limit values for motion vectors to prevent them extending outside the UMV borders - x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8)); + x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) - + (VP9BORDERINPIXELS - 8); + + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search if (cm->current_video_frame > 0) { diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 5a671f201..0a6576eb5 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -10,14 +10,17 @@ #include <limits.h> -#include <vpx_mem/vpx_mem.h> -#include <vp9/encoder/vp9_encodeintra.h> -#include <vp9/encoder/vp9_rdopt.h> -#include <vp9/common/vp9_blockd.h> -#include <vp9/common/vp9_reconinter.h> -#include <vp9/common/vp9_reconintra.h> -#include <vp9/common/vp9_systemdependent.h> -#include <vp9/encoder/vp9_segmentation.h> +#include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_encodeintra.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_systemdependent.h" + + static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv *ref_mv, @@ -46,9 +49,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.as_mv.row = ref_mv->as_mv.row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, + best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit, 
0, &v_fn_ptr, - 0, ref_mv, dst_mv); + 0, &ref_mv->as_mv, &dst_mv->as_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) @@ -57,7 +60,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, unsigned int sse; best_err = cpi->find_fractional_mv_step( x, - dst_mv, ref_mv, + &dst_mv->as_mv, &ref_mv->as_mv, x->errorperbit, &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, & distortion, &sse); @@ -246,9 +249,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; gld_top_mv.as_int = 0; - x->mv_row_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND); - x->mv_row_max = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS - - 8 - VP9_INTERP_EXTEND; + x->mv_row_min = -BORDER_MV_PIXELS_B16; + x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; xd->up_available = 0; xd->plane[0].dst.stride = buf->y_stride; xd->plane[0].pre[0].stride = buf->y_stride; @@ -267,9 +269,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_left_mv.as_int = arf_top_mv.as_int; gld_left_mv.as_int = gld_top_mv.as_int; - x->mv_col_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND); - x->mv_col_max = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS - - 8 - VP9_INTERP_EXTEND; + x->mv_col_min = -BORDER_MV_PIXELS_B16; + x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; xd->left_available = 0; for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 7dd786904..44eaa657c 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -59,38 +59,39 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) { return sr; } -int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int weight) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - 
ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] + - mvcost[0][v.row] + - mvcost[1][v.col]) * weight, 7); +static INLINE int mv_cost(const MV *mv, + const int *joint_cost, int *comp_cost[2]) { + return joint_cost[vp9_get_mv_joint(mv)] + + comp_cost[0][mv->row] + comp_cost[1][mv->col]; } -static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], +int vp9_mv_bit_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int weight) { + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7); +} + +static int mv_err_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int error_per_bit) { if (mvcost) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] + - mvcost[0][v.row] + - mvcost[1][v.col]) * error_per_bit, 13); + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * + error_per_bit, 13); } return 0; } -static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, - int *mvsadcost[2], int error_per_bit) { +static int mvsad_err_cost(const MV *mv, const MV *ref, + const int *mvjsadcost, int *mvsadcost[2], + int error_per_bit) { if (mvsadcost) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] + - mvsadcost[0][v.row] + - mvsadcost[1][v.col]) * error_per_bit, 8); + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) * + error_per_bit, 8); } return 0; } @@ -273,7 +274,7 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { } int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int 
error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -294,25 +295,25 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, int thismse; const int y_stride = xd->plane[0].pre[0].stride; - const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->row <<= 3; + bestmv->col <<= 3; // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, sse1); @@ -347,7 +348,7 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, } } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; while (eighthiters--) { @@ -360,18 +361,18 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, } } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if 
((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; } int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -391,25 +392,25 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, unsigned int eighthiters = iters_per_step; const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->as_mv.row * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, sse1); @@ -435,7 +436,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; FIRST_LEVEL_CHECKS; @@ -446,11 +447,11 @@ int 
vp9_find_best_sub_pixel_tree(MACROBLOCK *x, tc = bc; } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; @@ -463,7 +464,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, z, src_stride, &sse, second_pred) int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -487,25 +488,25 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->as_mv.row * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *const y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row *= 8; - 
bestmv->as_mv.col *= 8; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- @@ -543,7 +544,7 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, } } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; while (eighthiters--) { @@ -555,18 +556,18 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, tc = bc; } } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; } int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -589,25 +590,25 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); const int y_stride = xd->plane[0].pre[0].stride; - const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, 
ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- @@ -641,7 +642,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; FIRST_LEVEL_CHECKS; @@ -651,11 +652,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tr = br; tc = bc; } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; @@ -679,10 +680,10 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, #define CHECK_POINT \ {\ - if (this_mv.as_mv.col < x->mv_col_min) continue;\ - if (this_mv.as_mv.col > x->mv_col_max) continue;\ - if (this_mv.as_mv.row < x->mv_row_min) continue;\ - if (this_mv.as_mv.row > x->mv_row_max) continue;\ + if (this_mv.col < x->mv_col_min) continue;\ + if (this_mv.col > x->mv_col_max) continue;\ + if (this_mv.row < x->mv_row_min) continue;\ + if (this_mv.row > x->mv_row_max) continue;\ } #define CHECK_BETTER \ @@ -690,7 +691,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, if (thissad < bestsad)\ {\ if (use_mvcost) \ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \ + thissad 
+= mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \ mvjsadcost, mvsadcost, \ sad_per_bit);\ if (thissad < bestsad)\ @@ -715,14 +716,14 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, // candidates as indicated in the num_candidates and candidates arrays // passed into this function static int vp9_pattern_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, int do_refine, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, int_mv *best_mv, + const MV *center_mv, MV *best_mv, const int num_candidates[MAX_PATTERN_SCALES], const MV candidates[MAX_PATTERN_SCALES] [MAX_PATTERN_CANDIDATES]) { @@ -735,7 +736,7 @@ static int vp9_pattern_search(MACROBLOCK *x, int what_stride = x->plane[0].src.stride; int in_what_stride = xd->plane[0].pre[0].stride; int br, bc; - int_mv this_mv; + MV this_mv; int bestsad = INT_MAX; int thissad; uint8_t *base_offset; @@ -748,24 +749,22 @@ static int vp9_pattern_search(MACROBLOCK *x, int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + fcenter_mv.as_mv.row = center_mv->row >> 3; + fcenter_mv.as_mv.col = center_mv->col >> 3; // adjust ref_mv to make sure it is within MV range - clamp_mv(&ref_mv->as_mv, - x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - br = ref_mv->as_mv.row; - bc = ref_mv->as_mv.col; + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + br = ref_mv->row; + bc = ref_mv->col; // Work out the start point for the search base_offset = (uint8_t *)(xd->plane[0].pre[0].buf); this_offset = base_offset + (br * in_what_stride) + bc; - this_mv.as_mv.row = br; - this_mv.as_mv.col = bc; - bestsad = vfp->sdf(what, what_stride, this_offset, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + this_mv.row = br; + 
this_mv.col = bc; + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Search all possible scales upto the search param around the center point // pick the scale of the point that is best as the starting scale of @@ -778,21 +777,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS((1 << t)) if (all_in) { for (i = 0; i < num_candidates[t]; i++) { - this_mv.as_mv.row = br + candidates[t][i].row; - this_mv.as_mv.col = bc + candidates[t][i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_mv.row = br + candidates[t][i].row; + this_mv.col = bc + candidates[t][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < num_candidates[t]; i++) { - this_mv.as_mv.row = br + candidates[t][i].row; - this_mv.as_mv.col = bc + candidates[t][i].col; + this_mv.row = br + candidates[t][i].row; + this_mv.col = bc + candidates[t][i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -822,21 +821,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS((1 << s)) if (all_in) { for (i = 0; i < num_candidates[s]; i++) { - this_mv.as_mv.row = br + candidates[s][i].row; - this_mv.as_mv.col = bc + candidates[s][i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_mv.row = br + candidates[s][i].row; + this_mv.col = bc + candidates[s][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 
bestsad); CHECK_BETTER } } else { for (i = 0; i < num_candidates[s]; i++) { - this_mv.as_mv.row = br + candidates[s][i].row; - this_mv.as_mv.col = bc + candidates[s][i].col; + this_mv.row = br + candidates[s][i].row; + this_mv.col = bc + candidates[s][i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -860,25 +859,21 @@ static int vp9_pattern_search(MACROBLOCK *x, get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); if (all_in) { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.as_mv.row = br + - candidates[s][next_chkpts_indices[i]].row; - this_mv.as_mv.col = bc + - candidates[s][next_chkpts_indices[i]].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; + this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.as_mv.row = br + - candidates[s][next_chkpts_indices[i]].row; - this_mv.as_mv.col = bc + - candidates[s][next_chkpts_indices[i]].col; + this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; + this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -905,21 +900,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS(1) if (all_in) { for (i = 0; i < 4; i++) { - this_mv.as_mv.row 
= br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -935,31 +930,32 @@ static int vp9_pattern_search(MACROBLOCK *x, } } - best_mv->as_mv.row = br; - best_mv->as_mv.col = bc; + best_mv->row = br; + best_mv->col = bc; - this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) + - best_mv->as_mv.col; - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; + this_offset = base_offset + (best_mv->row * in_what_stride) + + best_mv->col; + this_mv.row = best_mv->row * 8; + this_mv.col = best_mv->col * 8; if (bestsad == INT_MAX) return INT_MAX; - return - vfp->vf(what, what_stride, this_offset, in_what_stride, - (unsigned int *)(&bestsad)) + - use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost, - x->errorperbit) : 0; + + return vfp->vf(what, what_stride, this_offset, in_what_stride, + (unsigned int *)&bestsad) + + use_mvcost ? 
mv_err_cost(&this_mv, center_mv, + x->nmvjointcost, x->mvcost, x->errorperbit) + : 0; } int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, int_mv *best_mv) { + const MV *center_mv, MV *best_mv) { // First scale has 8-closest points, the rest have 6 points in hex shape // at increasing scales static const int hex_num_candidates[MAX_PATTERN_SCALES] = { @@ -988,14 +984,14 @@ int vp9_hex_search(MACROBLOCK *x, } int vp9_bigdia_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv) { + const MV *center_mv, + MV *best_mv) { // First scale has 4-closest points, the rest have 8 points in diamond // shape at increasing scales static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { @@ -1022,22 +1018,21 @@ int vp9_bigdia_search(MACROBLOCK *x, {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024}, {-512, 512}, {-1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - bigdia_num_candidates, bigdia_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + bigdia_num_candidates, bigdia_candidates); } int vp9_square_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv) { + const MV *center_mv, + MV *best_mv) { // All scales have 8 closest points in square shape static const int square_num_candidates[MAX_PATTERN_SCALES] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, @@ -1064,11 +1059,10 @@ int vp9_square_search(MACROBLOCK *x, {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 
0}, {1024, 1024}, {0, 1024}, {-1024, 1024}, {-1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - square_num_candidates, square_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + square_num_candidates, square_candidates); }; #undef CHECK_BOUNDS @@ -1124,10 +1118,9 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // search_param determines the length of the initial step and hence the number of iterations // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. 
@@ -1153,7 +1146,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1185,7 +1178,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1210,8 +1203,9 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, return INT_MAX; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, center_mv, mvjcost, - mvcost, x->errorperbit); + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } int vp9_diamond_search_sadx4(MACROBLOCK *x, @@ -1265,10 +1259,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, - in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // search_param determines the length of the initial step and hence the number of iterations // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. 
@@ -1303,7 +1296,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (sad_array[t] < bestsad) { this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row; this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col; - sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv, + sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (sad_array[t] < bestsad) { @@ -1327,7 +1320,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1358,7 +1351,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1383,8 +1376,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, return INT_MAX; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, - center_mv, mvjcost, mvcost, x->errorperbit); + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } /* do_refine: If last step (1-away) of n-step search doesn't pick the center @@ -1495,8 +1489,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV 
border @@ -1513,8 +1507,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1531,10 +1525,10 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1585,8 +1579,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1610,8 +1604,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1631,7 +1625,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, 
&fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1652,10 +1646,10 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1708,8 +1702,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1733,8 +1727,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1759,7 +1753,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1780,8 +1774,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, 
&fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1800,10 +1794,10 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1834,8 +1828,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -1852,8 +1848,8 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1876,10 +1872,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int 
*)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1911,8 +1907,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -1935,8 +1933,8 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, if (sad_array[j] < bestsad) { this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row; this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col; - sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (sad_array[j] < bestsad) { bestsad = sad_array[j]; @@ -1957,8 +1955,8 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1982,10 +1980,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int *)(&thissad)) + + 
mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2025,7 +2023,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, /* Get compound pred by averaging two pred blocks. */ bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, second_pred, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -2048,9 +2047,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); - + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; best_site = j; @@ -2075,10 +2073,10 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (bestsad < INT_MAX) { // FIXME(rbultje, yunqing): add full-pixel averaging variance functions // so we don't have to use the subpixel with xoff=0,yoff=0 here. 
- return fn_ptr->svaf(best_address, in_what_stride, 0, 0, - what, what_stride, (unsigned int *)(&thissad), - second_pred) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride, + (unsigned int *)(&thissad), second_pred) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } else { return INT_MAX; } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 3598fa09a..77c157c5b 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -22,10 +22,14 @@ #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) +// Allowed motion vector pixel distance outside image border +// for Block_16x16 +#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND) + void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv); -int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, - int *mvcost[2], int weight); +int vp9_mv_bit_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int weight); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); @@ -40,37 +44,36 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, int_mv *dst_mv); int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); int vp9_bigdia_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); int vp9_square_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int 
error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); typedef int (fractional_mv_step_fp) ( MACROBLOCK *x, - int_mv *bestmv, - int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only @@ -84,7 +87,7 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; typedef int (fractional_mv_step_comp_fp) ( MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 9b20dafde..a106014f8 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -61,16 +61,11 @@ #define INTRA_ZBIN_BOOST 0 typedef struct { - nmv_context nmvc; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; int nmvcosts_hp[2][MV_VALS]; vp9_prob segment_pred_probs[PREDICTION_PROBS]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob single_ref_prob[REF_CONTEXTS][2]; - vp9_prob comp_ref_prob[REF_CONTEXTS]; unsigned char *last_frame_seg_map_copy; @@ -79,20 +74,8 @@ typedef struct { // 0 = ZERO_MV, MV signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - - vp9_prob y_mode_prob[4][INTRA_MODES - 1]; - vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - - vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS - 1]; - int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; - - struct tx_probs tx_probs; - vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; + FRAME_CONTEXT fc; } CODING_CONTEXT; typedef 
struct { @@ -649,7 +632,7 @@ typedef struct VP9_COMP { unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] [SWITCHABLE_FILTERS]; - unsigned int txfm_stepdown_count[TX_SIZES]; + unsigned int tx_stepdown_count[TX_SIZES]; int initial_width; int initial_height; @@ -712,9 +695,8 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x); void vp9_set_speed_features(VP9_COMP *cpi); -extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest); +int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp9_alloc_compressor_data(VP9_COMP *cpi); +void vp9_alloc_compressor_data(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_ONYX_INT_H_ diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 6c8b2a04b..05e893ee9 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -337,10 +337,10 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) { vp9_mb_init_quantizer(cpi, &cpi->mb); } -void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) { +void vp9_set_quantizer(struct VP9_COMP *cpi, int q) { VP9_COMMON *cm = &cpi->common; - cm->base_qindex = Q; + cm->base_qindex = q; // if any of the delta_q values are changing update flag will // have to be set. 
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 3229eaad2..3191c49ae 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -30,14 +30,14 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks); struct VP9_COMP; -extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q); +void vp9_set_quantizer(struct VP9_COMP *cpi, int q); -extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi); +void vp9_frame_init_quantizer(struct VP9_COMP *cpi); -extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); -extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); -extern void vp9_init_quantizer(struct VP9_COMP *cpi); +void vp9_init_quantizer(struct VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 2d12ba94f..bbcad172d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -76,35 +76,19 @@ void vp9_save_coding_context(VP9_COMP *cpi) { // restored with a call to vp9_restore_coding_context. These functions are // intended for use in a re-code loop in vp9_compress_frame where the // quantizer value is adjusted between loop iterations. 
- - cc->nmvc = cm->fc.nmvc; vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts); vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp); - vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs); - - vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob); - vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); - vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); - vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); - vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); - vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob); - vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob); - vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); - vp9_copy(cc->coef_probs, cm->fc.coef_probs); - vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); - cc->tx_probs = cm->fc.tx_probs; - vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs); + cc->fc = cm->fc; } void vp9_restore_coding_context(VP9_COMP *cpi) { @@ -113,25 +97,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. 
- - cm->fc.nmvc = cc->nmvc; vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts); vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp); - vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs); - - vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob); - vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); - vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); - vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); - vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); - vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob); - vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob); - vpx_memcpy(cm->last_frame_seg_map, cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); @@ -139,10 +110,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); - vp9_copy(cm->fc.coef_probs, cc->coef_probs); - vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); - cm->fc.tx_probs = cc->tx_probs; - vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs); + cm->fc = cc->fc; } void vp9_setup_key_frame(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 473317605..ddda7130c 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -32,8 +32,8 @@ int vp9_pick_frame_size(VP9_COMP *cpi); double vp9_convert_qindex_to_q(int qindex); int vp9_gfboost_qadjust(int qindex); -extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor); +int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor); void vp9_setup_inter_frame(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 3ef3eeeeb..83cd61226 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -186,6 +186,7 @@ void 
vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { // cpi->common.refresh_alt_ref_frame) qindex = clamp(qindex, 0, MAXQ); + cpi->RDDIV = 100; cpi->RDMULT = compute_rd_mult(qindex); if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { if (cpi->twopass.next_iiratio > 31) @@ -204,42 +205,18 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { if (q < 8) q = 8; - if (cpi->RDMULT > 1000) { - cpi->RDDIV = 1; - cpi->RDMULT /= 100; + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + for (i = 0; i < MAX_MODES; i++) { + // Threshold here seem unecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[] + int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - for (i = 0; i < MAX_MODES; ++i) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - // *4 relates to the scaling of rd_thresh_block_size_factor[] - if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = - cpi->sf.thresh_mult[i] * q * - rd_thresh_block_size_factor[bsize] / (4 * 100); - } else { - cpi->rd_threshes[bsize][i] = INT_MAX; - } - } - } - } else { - cpi->RDDIV = 100; - - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - for (i = 0; i < MAX_MODES; i++) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - if (cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = + if (cpi->sf.thresh_mult[i] < thresh_max) { + cpi->rd_threshes[bsize][i] = cpi->sf.thresh_mult[i] * q * rd_thresh_block_size_factor[bsize] / 4; - } else { - cpi->rd_threshes[bsize][i] = INT_MAX; - } + } else { + cpi->rd_threshes[bsize][i] = INT_MAX; } } } @@ -554,9 +531,13 @@ struct rdcost_block_args { TX_SIZE 
tx_size; int bw; int bh; - int rate; - int64_t dist; - int64_t sse; + int rate[256]; + int64_t dist[256]; + int64_t sse[256]; + int this_rate; + int64_t this_dist; + int64_t this_sse; + int64_t this_rd; int64_t best_rd; int skip; const int16_t *scan, *nb; @@ -573,17 +554,17 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { int shift = args->tx_size == TX_32X32 ? 0 : 2; int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + args->dist[block] = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; - args->sse += this_sse >> shift; + args->sse[block] = this_sse >> shift; if (x->skip_encode && xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> shift; - args->dist += p; - args->sse += p; + args->dist[block] = p; + args->sse[block] = p; } } @@ -594,10 +575,10 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, int x_idx, y_idx; txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); - args->rate += cost_coeffs(args->x, plane, block, - args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->scan, args->nb); + args->rate[block] = cost_coeffs(args->x, plane, block, + args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); } static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -610,16 +591,6 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (args->skip) return; - rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); - rd = MIN(rd1, rd2); - if (rd > args->best_rd) { - args->skip = 1; - args->rate = INT_MAX; - args->dist = INT64_MAX; - args->sse = INT64_MAX; - return; - } if 
(!is_inter_block(&xd->this_mi->mbmi)) vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); @@ -628,6 +599,56 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]); + + // TODO(jingning): temporarily enabled only for luma component + rd = MIN(rd1, rd2); + if (plane == 0) + x->zcoeff_blk[tx_size][block] = rd1 > rd2; + + args->this_rate += args->rate[block]; + args->this_dist += args->dist[block]; + args->this_sse += args->sse[block]; + args->this_rd += rd; + + if (args->this_rd > args->best_rd) { + args->skip = 1; + return; + } +} + +void vp9_get_entropy_contexts(TX_SIZE tx_size, + ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], + const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, + int num_4x4_w, int num_4x4_h) { + int i; + switch (tx_size) { + case TX_4X4: + vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_32X32: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + default: + assert(!"Invalid transform size."); + } } static void txfm_rd_in_plane(MACROBLOCK *x, @@ -638,45 +659,33 @@ static void txfm_rd_in_plane(MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane 
*const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs]; - int i; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; + struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, - num_4x4_blocks_wide, num_4x4_blocks_high, - 0, 0, 0, ref_best_rd, 0 }; + num_4x4_w, num_4x4_h, + { 0 }, { 0 }, { 0 }, + 0, 0, 0, 0, ref_best_rd, 0 }; if (plane == 0) xd->this_mi->mbmi.tx_size = tx_size; + vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left, + pd->above_context, pd->left_context, + num_4x4_w, num_4x4_h); switch (tx_size) { case TX_4X4: - vpx_memcpy(&args.t_above, pd->above_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide); - vpx_memcpy(&args.t_left, pd->left_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high); get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0), &args.scan, &args.nb); break; case TX_8X8: - for (i = 0; i < num_4x4_blocks_wide; i += 2) - args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 2) - args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_16X16: - for (i = 0; i < num_4x4_blocks_wide; i += 4) - args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 4) - args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_32X32: - for (i = 0; i < num_4x4_blocks_wide; i += 8) - args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 8) - args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; args.scan = vp9_default_scan_32x32; args.nb = vp9_default_scan_32x32_neighbors; break; @@ -685,10 +694,17 @@ static 
void txfm_rd_in_plane(MACROBLOCK *x, } foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); - *distortion = args.dist; - *rate = args.rate; - *sse = args.sse; - *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip); + if (args.skip) { + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + } else { + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; + *skippable = vp9_is_skippable_in_plane(xd, bsize, plane); + } } static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, @@ -696,15 +712,15 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { mbmi->tx_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { @@ -717,7 +733,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - cpi->txfm_stepdown_count[0]++; + cpi->tx_stepdown_count[0]++; } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, @@ -811,15 +827,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[0]++; + cpi->tx_stepdown_count[0]++; } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]) { - 
cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++; + cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++; + cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++; + cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -829,7 +845,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int *s, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; @@ -845,9 +861,9 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; - for (m = 0; m <= n - (n == max_txfm_size); m++) { + for (m = 0; m <= n - (n == max_tx_size); m++) { if (m == n) r[n][1] += vp9_cost_zero(tx_probs[m]); else @@ -859,7 +875,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { if (s[n]) { rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { @@ -867,19 +883,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); } } - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]); rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]); } - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT 
&& rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]))) { mbmi->tx_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && @@ -901,19 +917,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[0]++; - } else if (max_txfm_size >= TX_16X16 && + cpi->tx_stepdown_count[0]++; + } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] <= rd[TX_8X8][1] && rd[TX_16X16][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -1058,6 +1074,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { int64_t ssz; const int16_t *scan; + const int16_t *nb; uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; @@ -1083,10 +1100,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->quantize_b_4x4(x, block, tx_type, 16); } - scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); + get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block), + &scan, &nb); ratey += cost_coeffs(x, 0, block, - tempa + idx, templ + idy, TX_4X4, scan, - 
vp9_get_coef_neighbors_handle(scan)); + tempa + idx, templ + idy, TX_4X4, scan, nb); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &ssz) >> 2; if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) @@ -1458,11 +1475,12 @@ static int labels2mode(MACROBLOCK *x, int i, switch (m = this_mode) { case NEWMV: this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost, - 102); + thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, + mvjcost, mvcost, 102); if (has_second_rf) { this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv, + thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv, + &second_best_ref_mv->as_mv, mvjcost, mvcost, 102); } break; @@ -1796,20 +1814,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // adjust src pointer for this block mi_buf_shift(x, i); if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, + bestsme = vp9_hex_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, + bestsme = vp9_square_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, + bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 0, v_fn_ptr, @@ -1840,8 +1861,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int distortion; unsigned int sse; - 
cpi->find_fractional_mv_step(x, &mode_mv[NEWMV], - bsi->ref_mv, x->errorperbit, v_fn_ptr, + cpi->find_fractional_mv_step(x, + &mode_mv[NEWMV].as_mv, + &bsi->ref_mv->as_mv, + x->errorperbit, v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &distortion, &sse); @@ -2220,11 +2243,12 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; - // FIXME(rbultje) does this memcpy the whole array? I believe sizeof() - // doesn't actually work this way - memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); - memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[xd->this_mi->mbmi.tx_size], + sizeof(ctx->zcoeff_blk)); + + vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); + vpx_memcpy(ctx->best_filter_diff, best_filter_diff, + sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2403,23 +2427,23 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, + bestsme = vp9_hex_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, + bestsme = vp9_square_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, + bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } 
else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1, @@ -2435,16 +2459,15 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv, + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, x->errorperbit, &cpi->fn_ptr[block_size], 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, &sse); } - *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv, - x->nmvjointcost, x->mvcost, - 96); + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + x->nmvjointcost, x->mvcost, 96); if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) x->pred_mv[ref].as_int = tmp_mv->as_int; @@ -2570,8 +2593,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, unsigned int sse; bestsme = cpi->find_fractional_mv_step_comp( - x, &tmp_mv, - &ref_mv[id], + x, &tmp_mv.as_mv, + &ref_mv[id].as_mv, x->errorperbit, &cpi->fn_ptr[block_size], 0, cpi->sf.subpel_iters_per_step, @@ -2603,11 +2626,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[1] = backup_second_yv12[i]; } - *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], + *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi->ref_mvs[refs[0]][0].as_mv, x->nmvjointcost, x->mvcost, 96); - *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], + *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi->ref_mvs[refs[1]][0].as_mv, x->nmvjointcost, x->mvcost, 96); vpx_free(second_pred); @@ -2630,7 +2653,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; - const int is_comp_pred = (mbmi->ref_frame[1] > 0); + const int is_comp_pred = has_second_ref(mbmi); const int 
num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; @@ -2659,11 +2682,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); } else { - rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], + rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi->ref_mvs[refs[0]][0].as_mv, x->nmvjointcost, x->mvcost, 96); - rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], + rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi->ref_mvs[refs[1]][0].as_mv, x->nmvjointcost, x->mvcost, 96); } if (frame_mv[refs[0]].as_int == INVALID_MV || @@ -3071,8 +3094,12 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); *returndist = dist_y + dist_uv; if (cpi->sf.tx_size_search_method == USE_FULL_RD) - for (i = 0; i < TX_MODES; i++) - ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; + for (i = 0; i < TX_MODES; i++) { + if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX) + ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; + else + ctx->tx_rd_diff[i] = 0; + } } ctx->mic = *xd->this_mi; @@ -3139,8 +3166,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; + unsigned char best_zcoeff_blk[256] = { 0 }; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + vpx_memset(x->zcoeff_blk, 0, sizeof(x->zcoeff_blk)); + vpx_memset(ctx->zcoeff_blk, 0, sizeof(ctx->zcoeff_blk)); for (i = 0; i < 4; i++) { int j; @@ -3812,6 +3842,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_mbmode = *mbmi; best_skip2 = this_skip2; best_partition = *x->partition_info; + vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + 
sizeof(best_zcoeff_blk)); if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV) for (i = 0; i < 4; i++) @@ -3993,13 +4025,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (best_mbmode.ref_frame[0] != INTRA_FRAME && best_mbmode.sb_type < BLOCK_8X8) { for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mv[0].as_int = - best_bmodes[i].as_mv[0].as_int; + xd->this_mi->bmi[i].as_mv[0].as_int = best_bmodes[i].as_mv[0].as_int; - if (mbmi->ref_frame[1] > 0) + if (has_second_ref(mbmi)) for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mv[1].as_int = - best_bmodes[i].as_mv[1].as_int; + xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int; *x->partition_info = best_partition; @@ -4007,6 +4037,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; } + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], best_zcoeff_blk, + sizeof(best_zcoeff_blk)); + for (i = 0; i < NB_PREDICTION_TYPES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index eba7df907..9796c0d7c 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -33,4 +33,9 @@ void vp9_init_me_luts(); void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); +void vp9_get_entropy_contexts(TX_SIZE tx_size, + ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], + const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, + int num_4x4_w, int num_4x4_h); + #endif // VP9_ENCODER_VP9_RDOPT_H_ diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 10655e8a7..874b71ab1 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -130,6 +130,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, return; segment_id = mi_8x8[0]->mbmi.segment_id; + xd->mi_8x8 = mi_8x8; + xd->this_mi = mi_8x8[0]; set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); diff --git 
a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 63826eea5..1768b5bed 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -154,10 +154,10 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost arrays ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full, + bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1, ref_mv); + 0, &best_ref_mv1.as_mv, &ref_mv->as_mv); #if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? @@ -166,8 +166,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, ref_mv, - &best_ref_mv1, + bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv, + &best_ref_mv1.as_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, cpi->sf.subpel_iters_per_step, diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index a59f6db88..7c14c18aa 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -137,8 +137,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, pt = get_entropy_context(tx_size, pd->above_context + aoff, pd->left_context + loff); - get_scan_and_band(xd, tx_size, type, block, &scan, &band_translate); - nb = vp9_get_coef_neighbors_handle(scan); + get_scan_and_band(xd, tx_size, type, block, &scan, &nb, &band_translate); c = 0; do { const int band = get_coef_band(band_translate, c); diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index 155ba8a3e..991ef4d29 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -8,13 +8,150 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp9_rtcd.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_subpelvar.h" -#include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/encoder/vp9_variance.h" + +static void variance(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum) { + int i, j; + int diff; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + + src_ptr += source_stride; + ref_ptr += recon_stride; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : uint8_t *src_ptr : Pointer to source block. + * uint32_t src_pixels_per_line : Stride of input block. + * uint32_t pixel_step : Offset between filter input + * samples (see notes). + * uint32_t output_height : Input block height. + * uint32_t output_width : Input block width. + * int32_t *vp9_filter : Array of 2 bi-linear filter + * taps. + * + * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement first-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step= + * stride). + * It defines the offset required to move from one input + * to the next. 
+ * + ****************************************************************************/ +static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : int32_t *src_ptr : Pointer to source block. + * uint32_t src_pixels_per_line : Stride of input block. + * uint32_t pixel_step : Offset between filter input + * samples (see notes). + * uint32_t output_height : Input block height. + * uint32_t output_width : Input block width. + * int32_t *vp9_filter : Array of 2 bi-linear filter + * taps. + * + * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement second-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by + * filter_block2d_bil_first_pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step= + * stride). + * It defines the offset required to move from one input + * to the next. 
+ * + ****************************************************************************/ +static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 687fb487c..7d040f7db 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -48,7 +48,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconintra.h VP9_COMMON_SRCS-yes += common/vp9_rtcd.c VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h -VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h VP9_COMMON_SRCS-yes += common/vp9_scale.h VP9_COMMON_SRCS-yes += common/vp9_scale.c VP9_COMMON_SRCS-yes += common/vp9_seg_common.h @@ -92,7 +91,6 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) @@ -109,5 +107,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += 
common/arm/neon/vp9_copy_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 08a1a8458..157752a86 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -89,7 +89,7 @@ struct vpx_codec_alg_priv { unsigned int fixed_kf_cntr; }; -static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { +static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { switch (frame) { case VP8_LAST_FRAME: return VP9_LAST_FLAG; diff --git a/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/vpx_scale/mips/dspr2/yv12extend_dspr2.c new file mode 100644 index 000000000..2c5cd1a87 --- /dev/null +++ b/vpx_scale/mips/dspr2/yv12extend_dspr2.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpx_scale.h" + +#if HAVE_DSPR2 +static void extend_plane(uint8_t *const src, int src_stride, + int width, int height, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, j; + uint8_t *left_src, *right_src; + uint8_t *left_dst_start, *right_dst_start; + uint8_t *left_dst, *right_dst; + uint8_t *top_src, *bot_src; + uint8_t *top_dst, *bot_dst; + uint32_t left_pix; + uint32_t right_pix; + uint32_t linesize; + + /* copy the left and right most columns out */ + left_src = src; + right_src = src + width - 1; + left_dst_start = src - extend_left; + right_dst_start = src + width; + + for (i = height; i--; ) { + left_dst = left_dst_start; + right_dst = right_dst_start; + + __asm__ __volatile__ ( + "lb %[left_pix], 0(%[left_src]) \n\t" + "lb %[right_pix], 0(%[right_src]) \n\t" + "replv.qb %[left_pix], %[left_pix] \n\t" + "replv.qb %[right_pix], %[right_pix] \n\t" + + : [left_pix] "=&r" (left_pix), [right_pix] "=&r" (right_pix) + : [left_src] "r" (left_src), [right_src] "r" (right_src) + ); + + for (j = extend_left/4; j--; ) { + __asm__ __volatile__ ( + "sw %[left_pix], 0(%[left_dst]) \n\t" + "sw %[right_pix], 0(%[right_dst]) \n\t" + + : + : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix), + [right_dst] "r" (right_dst), [right_pix] "r" (right_pix) + ); + + left_dst += 4; + right_dst += 4; + } + + for (j = extend_left%4; j--; ) { + __asm__ __volatile__ ( + "sb %[left_pix], 0(%[left_dst]) \n\t" + "sb %[right_pix], 0(%[right_dst]) \n\t" + + : + : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix), + [right_dst] "r" (right_dst), [right_pix] "r" (right_pix) + ); + + left_dst += 1; + right_dst += 1; + } + + left_src += src_stride; + right_src += src_stride; + left_dst_start += src_stride; + right_dst_start += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * 
borders + */ + top_src = src - extend_left; + bot_src = src + src_stride * (height - 1) - extend_left; + top_dst = src + src_stride * (-extend_top) - extend_left; + bot_dst = src + src_stride * (height) - extend_left; + linesize = extend_left + extend_right + width; + + for (i = 0; i < extend_top; i++) { + vpx_memcpy(top_dst, top_src, linesize); + top_dst += src_stride; + } + + for (i = 0; i < extend_bottom; i++) { + vpx_memcpy(bot_dst, bot_src, linesize); + bot_dst += src_stride; + } +} + +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, + int subsampling_x, int subsampling_y, + int ext_size) { + const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x; + const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y; + const int c_et = ext_size >> subsampling_y; + const int c_el = ext_size >> subsampling_x; + const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height + + subsampling_y) >> subsampling_y; + const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width + + subsampling_x) >> subsampling_x; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + + extend_plane(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + + extend_plane(ybf->v_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); +} + +void vp9_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y) { + extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); +} + +void vp9_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, + int subsampling_y) { + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ? 
+ VP9INNERBORDERINPIXELS : ybf->border; + extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); +} +#endif diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk index 76c11e792..50d3e9d8e 100644 --- a/vpx_scale/vpx_scale.mk +++ b/vpx_scale/vpx_scale.mk @@ -16,6 +16,9 @@ SCALE_SRCS-$(HAVE_NEON) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM) SCALE_SRCS-$(HAVE_NEON) += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM) SCALE_SRCS-$(HAVE_NEON) += arm/neon/yv12extend_arm.c +#mips(dspr2) +SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c + SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes) $(eval $(call asm_offsets_template,\ diff --git a/vpx_scale/vpx_scale_rtcd.sh b/vpx_scale/vpx_scale_rtcd.sh index ea7b0e2e8..a5faf1148 100644 --- a/vpx_scale/vpx_scale_rtcd.sh +++ b/vpx_scale/vpx_scale_rtcd.sh @@ -27,8 +27,8 @@ specialize vpx_yv12_copy_y neon if [ "$CONFIG_VP9" = "yes" ]; then prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" - specialize vp9_extend_frame_borders + specialize vp9_extend_frame_borders dspr2 prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" - specialize vp9_extend_frame_inner_borders_c + specialize vp9_extend_frame_inner_borders dspr2 fi @@ -725,14 +725,12 @@ write_webm_file_header(EbmlGlobal *glob, { unsigned int pixelWidth = cfg->g_w; unsigned int pixelHeight = cfg->g_h; - float frameRate = (float)fps->num / (float)fps->den; EbmlLoc videoStart; Ebml_StartSubElement(glob, &videoStart, Video); Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth); Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight); Ebml_SerializeUnsigned(glob, StereoMode, stereo_fmt); - Ebml_SerializeFloat(glob, FrameRate, frameRate); Ebml_EndSubElement(glob, &videoStart); } Ebml_EndSubElement(glob, &start); /* Track Entry */ |