diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 100 | ||||
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 591 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodemv.c | 36 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 230 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 230 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.h | 19 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 31 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.h | 12 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 3 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 2 |
13 files changed, 510 insertions, 753 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f4f758297..f95423678 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -23,105 +23,20 @@ typedef void filter8_1dfunction ( const short *filter ); -#if (HAVE_SSSE3) +#if HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -#if (ARCH_X86_64) -filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; - -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. */ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -#else -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -198,7 +113,6 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, w, h); } } -#endif void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c deleted file mode 100644 index 303fced3b..000000000 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <tmmintrin.h> -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, const unsigned char, -filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6}; - -DECLARE_ALIGNED(16, const unsigned char, -filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, const unsigned char, -filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8}; - -DECLARE_ALIGNED(16, const unsigned char, -filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10}; - -DECLARE_ALIGNED(16, const unsigned char, -filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12}; - -DECLARE_ALIGNED(16, const unsigned char, -filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14}; - - - -void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the third 16 bit in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the seconds 16 bits in the filter into the second lane - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the forth 16 bits in the filter into the second lane - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - // loading the local filters - thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); - forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); - srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); - - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); - - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=output_pitch; - } -} - - -void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - - srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); - - // filter the source buffer - srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes. - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); - - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - src_ptr+=src_pixels_per_line; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); - - output_ptr+=output_pitch; - } -} - - - -void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, firstFilters, secondFilters; - __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the second 16 bits in the filter into the second lane - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the third 16 bits in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the forth 16 bits in the filter into the second lane - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - for (i = 0; i < output_height; i++) { - // load the first 4 byte - srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0])); - // load the next 4 bytes in stride of src_pitch - srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0])); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - - - srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0])); - srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0])); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - - srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0])); - srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0])); - - // merge the result together - srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2); - - srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0])); - srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0])); - - // merge the result together - srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2); - srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - - // extract the second lane of the 128 bit register - srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_srli_si128(srcRegFilt3, 8)); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // save only 4 bytes convolve result - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=out_pitch; - } -} - -void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - for (i = 0; i < output_height; i++) { - // load the first 8 bytes - srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); - srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); - srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); - srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=out_pitch; - } -} - - -void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - - for (i = 0; i < output_height; i++) { - // load the first 16 bytes - srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); - // load the next 16 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); - srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); - - // merge the result together - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); - srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - - // load the next 16 bytes in stride of two/three src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); - - // merge the result together - srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); - srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); - - // load the next 16 bytes in stride of four/five src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); - - // merge the result together - srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); - srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); - - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_min_epi16(srcRegFilt4, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt6, srcRegFilt8)); - - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_max_epi16(srcRegFilt4, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt6, srcRegFilt8)); - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); - - src_ptr+=src_pitch; - - // save 16 bytes convolve result - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); - - output_ptr+=out_pitch; - } -} diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index c81378153..2eb99ea15 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -357,9 +357,9 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi, } static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, - int_mv mv[2], int_mv best_mv[2], - int_mv nearest_mv[2], int_mv near_mv[2], - int is_compound, int allow_hp, vp9_reader *r) { + int_mv mv[2], int_mv ref_mv[2], + int_mv nearest_mv[2], int_mv near_mv[2], + int is_compound, int allow_hp, vp9_reader *r) { int i; int ret = 1; @@ -367,10 +367,10 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, case NEWMV: { nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? NULL : &cm->counts.mv; - read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv, + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc.nmvc, mv_counts, allow_hp); if (is_compound) - read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv, + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc.nmvc, mv_counts, allow_hp); for (i = 0; i < 1 + is_compound; ++i) { ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW; @@ -380,17 +380,20 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, } case NEARESTMV: { mv[0].as_int = nearest_mv[0].as_int; - if (is_compound) mv[1].as_int = nearest_mv[1].as_int; + if (is_compound) + mv[1].as_int = nearest_mv[1].as_int; break; } case NEARMV: { mv[0].as_int = near_mv[0].as_int; - if (is_compound) mv[1].as_int = near_mv[1].as_int; + if (is_compound) + mv[1].as_int = near_mv[1].as_int; break; } case ZEROMV: { mv[0].as_int = 0; - if (is_compound) mv[1].as_int = 0; + if (is_compound) + mv[1].as_int = 0; break; } default: { @@ -423,7 +426,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - int_mv nearest[2], nearmv[2], best[2]; + int_mv nearestmv[2], nearmv[2]; int inter_mode_ctx, ref, is_compound; read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); @@ -452,8 +455,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { for (ref = 0; ref < 1 + is_compound; ++ref) { vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]], - &nearest[ref], &nearmv[ref]); - best[ref].as_int = nearest[ref].as_int; + &nearestmv[ref], &nearmv[ref]); } } @@ -466,6 +468,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; int b_mode; + int_mv nearest_sub8x8[2], near_sub8x8[2]; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv block[2]; @@ -475,9 +478,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (b_mode == NEARESTMV || b_mode == NEARMV) for (ref = 0; ref < 1 + is_compound; ++ref) vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col, - &nearest[ref], &nearmv[ref]); + &nearest_sub8x8[ref], + &near_sub8x8[ref]); - if (!assign_mv(cm, b_mode, block, best, nearest, nearmv, + if (!assign_mv(cm, b_mode, block, nearestmv, + nearest_sub8x8, near_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; break; @@ -499,9 +504,8 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } else { - xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, - best, nearest, nearmv, - is_compound, allow_hp, r); + xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv, + nearestmv, nearmv, is_compound, allow_hp, r); } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index c011948b9..c1b95817f 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -116,6 +116,7 @@ struct macroblock { unsigned int source_variance; unsigned int pred_sse[MAX_REF_FRAMES]; int pred_mv_sad[MAX_REF_FRAMES]; + int mode_sad[MAX_REF_FRAMES][INTER_MODES + 1]; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index b0fae6593..a66b9fb8e 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -34,6 +34,7 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_pickmode.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_tokenize.h" @@ -629,11 +630,11 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, } } -static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, - int *totalrate, int64_t *totaldist, - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd) { +static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, + int *totalrate, int64_t *totaldist, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1038,6 +1039,132 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { return 0; } +// TODO(jingning) This currently serves as a test framework for non-RD mode +// decision. To be continued on optimizing the partition type decisions. +static void pick_partition_type(VP9_COMP *cpi, + const TileInfo *const tile, + MODE_INFO **mi_8x8, TOKENEXTRA **tp, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist, + int do_recon) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + const int mi_stride = cm->mode_info_stride; + const int num_8x8_subsize = (num_8x8_blocks_wide_lookup[bsize] >> 1); + int i; + PARTITION_TYPE partition = PARTITION_NONE; + BLOCK_SIZE subsize; + BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; + int sub_rate[4] = {0}; + int64_t sub_dist[4] = {0}; + int mi_offset; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + return; + + partition = partition_lookup[b_width_log2(bsize)][bs_type]; + subsize = get_subsize(bsize, partition); + + if (bsize < BLOCK_8X8) { + // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 + // there is nothing to be done. + if (x->ab_index != 0) { + *rate = 0; + *dist = 0; + return; + } + } else { + *(get_sb_partitioning(x, bsize)) = subsize; + } + + switch (partition) { + case PARTITION_NONE: + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist, + bsize, get_block_context(x, bsize), INT64_MAX); + break; + case PARTITION_HORZ: + *get_sb_index(x, subsize) = 0; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0], + subsize, get_block_context(x, subsize), INT64_MAX); + if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + *get_sb_index(x, subsize) = 1; + rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col, + &sub_rate[1], &sub_dist[1], subsize, + get_block_context(x, subsize), INT64_MAX); + } + *rate = sub_rate[0] + sub_rate[1]; + *dist = sub_dist[0] + sub_dist[1]; + break; + case PARTITION_VERT: + *get_sb_index(x, subsize) = 0; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0], + subsize, get_block_context(x, subsize), INT64_MAX); + if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + *get_sb_index(x, subsize) = 1; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize, + &sub_rate[1], &sub_dist[1], subsize, + get_block_context(x, subsize), INT64_MAX); + } + *rate = sub_rate[0] + sub_rate[1]; + *dist = sub_dist[1] + sub_dist[1]; + break; + case PARTITION_SPLIT: + *get_sb_index(x, subsize) = 0; + pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, subsize, + &sub_rate[0], &sub_dist[0], 0); + + if ((mi_col + num_8x8_subsize) < cm->mi_cols) { + *get_sb_index(x, subsize) = 1; + pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize, tp, + mi_row, mi_col + num_8x8_subsize, subsize, + &sub_rate[1], &sub_dist[1], 0); + } + + if ((mi_row + num_8x8_subsize) < cm->mi_rows) { + *get_sb_index(x, subsize) = 2; + pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize * mi_stride, tp, + mi_row + num_8x8_subsize, mi_col, subsize, + &sub_rate[2], &sub_dist[2], 0); + } + + if ((mi_col + num_8x8_subsize) < cm->mi_cols && + (mi_row + num_8x8_subsize) < cm->mi_rows) { + *get_sb_index(x, subsize) = 3; + mi_offset = num_8x8_subsize * mi_stride + num_8x8_subsize; + pick_partition_type(cpi, tile, mi_8x8 + mi_offset, tp, + mi_row + num_8x8_subsize, mi_col + num_8x8_subsize, + subsize, &sub_rate[3], &sub_dist[3], 0); + } + + for (i = 0; i < 4; ++i) { + *rate += sub_rate[i]; + *dist += sub_dist[i]; + } + + break; + default: + assert(0); + } + + if (do_recon) { + int output_enabled = (bsize == BLOCK_64X64); + + // Check the projected output rate for this SB against it's target + // and and if necessary apply a Q delta using segmentation to get + // closer to the target. + if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + select_in_frame_q_segment(cpi, mi_row, mi_col, + output_enabled, *rate); + } + + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize); + } +} + static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, MODE_INFO **mi_8x8, @@ -1117,8 +1244,8 @@ static void rd_use_partition(VP9_COMP *cpi, mi_row + (ms >> 1) < cm->mi_rows && mi_col + (ms >> 1) < cm->mi_cols) { *(get_sb_partitioning(x, bsize)) = bsize; - pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, - get_block_context(x, bsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, + get_block_context(x, bsize), INT64_MAX); pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, @@ -1133,13 +1260,15 @@ static void rd_use_partition(VP9_COMP *cpi, switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - bsize, get_block_context(x, bsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, bsize, + get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, + get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { int rt = 0; @@ -1147,8 +1276,8 @@ static void rd_use_partition(VP9_COMP *cpi, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, - get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, + subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; last_part_dist = INT_MAX; @@ -1161,8 +1290,9 @@ static void rd_use_partition(VP9_COMP *cpi, break; case PARTITION_VERT: *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, + get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { int rt = 0; @@ -1170,8 +1300,8 @@ static void rd_use_partition(VP9_COMP *cpi, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, - get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, + subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; last_part_dist = INT_MAX; @@ -1245,9 +1375,9 @@ static void rd_use_partition(VP9_COMP *cpi, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, - split_subsize, get_block_context(x, split_subsize), - INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + split_subsize, get_block_context(x, split_subsize), + INT64_MAX); restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1611,8 +1741,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_NONE if (partition_none_allowed) { - pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, - get_block_context(x, bsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, + get_block_context(x, bsize), best_rd); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { pl = partition_plane_context(cpi->above_seg_context, @@ -1722,8 +1852,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { @@ -1737,9 +1867,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + rd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -1775,8 +1905,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { update_state(cpi, get_block_context(x, subsize), subsize, 0); @@ -1789,9 +1919,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -1862,8 +1992,8 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, if ((mi_row + (ms >> 1) < cm->mi_rows) && (mi_col + (ms >> 1) < cm->mi_cols)) { cpi->set_ref_frame_mask = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, - get_block_context(x, BLOCK_64X64), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, + get_block_context(x, BLOCK_64X64), INT64_MAX); pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, BLOCK_64X64); r += x->partition_cost[pl][PARTITION_NONE]; @@ -1875,6 +2005,34 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); } +static void encode_sb_row_rt(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { + VP9_COMMON *const cm = &cpi->common; + int mi_col; + + cpi->sf.always_this_block_size = BLOCK_8X8; + + // Initialize the left context for the new SB row + vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context)); + vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context)); + + // Code each SB in the row + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + int dummy_rate; + int64_t dummy_dist; + const int idx_str = cm->mode_info_stride * mi_row + mi_col; + MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; + + vp9_zero(cpi->mb.pred_mv); + + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); + pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1); + } +} + static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, TOKENEXTRA **tp) { VP9_COMMON *const cm = &cpi->common; @@ -2101,7 +2259,11 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_tile_init(&tile, cm, tile_row, tile_col); for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; mi_row += 8) +#if 1 encode_sb_row(cpi, &tile, mi_row, &tp); +#else + encode_sb_row_rt(cpi, &tile, mi_row, &tp); +#endif cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 538599d58..0a5af18cb 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1965,7 +1965,10 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) { cpi->rc.frames_to_key == 0 || (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { cm->frame_type = KEY_FRAME; + cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && + cpi->rc.frames_to_key == 0; cpi->rc.frames_to_key = cpi->key_frame_frequency; + cpi->rc.kf_boost = 300; } else { cm->frame_type = INTER_FRAME; } @@ -1982,7 +1985,10 @@ void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) { cpi->rc.frames_to_key == 0 || (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { cm->frame_type = KEY_FRAME; + cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && + cpi->rc.frames_to_key == 0; cpi->rc.frames_to_key = cpi->key_frame_frequency; + cpi->rc.kf_boost = 300; } else { cm->frame_type = INTER_FRAME; } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c new file mode 100644 index 000000000..17d1f5984 --- /dev/null +++ b/vp9/encoder/vp9_pickmode.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdio.h> +#include <math.h> +#include <limits.h> +#include <assert.h> + +#include "vp9/common/vp9_pragmas.h" +#include "vp9/encoder/vp9_tokenize.h" +#include "vp9/encoder/vp9_treewriter.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_variance.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_entropy.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_common.h" + +static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv, int *rate_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + int bestsme = INT_MAX; + int further_steps, step_param; + int sadpb = x->sadperbit16; + MV mvp_full; + int ref = mbmi->ref_frame[0]; + int_mv ref_mv = mbmi->ref_mvs[ref][0]; + int i; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + int buf_offset; + int stride = xd->plane[0].pre[0].stride; + + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + + setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + + vp9_set_mv_search_range(x, &ref_mv.as_mv); + + // TODO(jingning) exploiting adaptive motion search control in non-RD + // mode decision too. + step_param = 6; + further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + tmp_mv->as_int = INVALID_MV; + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return INT_MAX; + } + } + + mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; + + mvp_full.col >>= 3; + mvp_full.row >>= 3; + + bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, + sadpb, further_steps, 1, + &cpi->fn_ptr[bsize], + &ref_mv.as_mv, tmp_mv); + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + + // TODO(jingning) This step can be merged into full pixel search step in the + // re-designed log-diamond search + buf_offset = tmp_mv->as_mv.row * stride + tmp_mv->as_mv.col; + + // Find sad for current vector. + bestsme = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf + buf_offset, + stride, 0x7fffffff); + + // scale to 1/8 pixel resolution + tmp_mv->as_mv.row = tmp_mv->as_mv.row << 3; + tmp_mv->as_mv.col = tmp_mv->as_mv.col << 3; + + // calculate the bit cost on motion vector + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + + + return bestsme; +} + +// TODO(jingning) placeholder for inter-frame non-RD mode decision. +// this needs various further optimizations. to be continued.. +int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + int mi_row, int mi_col, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); + MB_PREDICTION_MODE this_mode; + MV_REFERENCE_FRAME ref_frame; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int64_t best_rd = INT64_MAX; + int64_t this_rd; + + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + + x->skip = 0; + if (cpi->active_map_enabled && x->active_ptr[0] == 0) + x->skip = 1; + + // initialize mode decisions + *returnrate = INT_MAX; + vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO)); + mbmi->sb_type = bsize; + mbmi->ref_frame[0] = NONE; + mbmi->ref_frame[1] = NONE; + mbmi->tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame), + ref_frame, block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + } + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + frame_mv[ZEROMV][ref_frame].as_int = 0; + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + int rate_mv = 0; + + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) + continue; + + // Select prediction reference frames. + xd->plane[0].pre[0] = yv12_mb[ref_frame][0]; + + + x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] = + full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], &rate_mv); + + if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV) + continue; + + clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd); + clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd); + + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + int rate = x->inter_mode_cost[mbmi->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + int64_t dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)] * + x->mode_sad[ref_frame][INTER_OFFSET(this_mode)]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + + if (this_rd < best_rd) { + best_rd = this_rd; + mbmi->mode = this_mode; + mbmi->ref_frame[0] = ref_frame; + mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + } + } + } + + // TODO(jingning) sub-pixel motion search, if NEWMV is chosen + + // TODO(jingning) intra prediction search, if the best SAD is above a certain + // threshold. + + // store mode decisions + ctx->mic = *xd->mi_8x8[0]; + + return INT64_MAX; +} diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h new file mode 100644 index 000000000..32750fa69 --- /dev/null +++ b/vp9/encoder/vp9_pickmode.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/encoder/vp9_onyx_int.h" + +int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, + int mi_row, int mi_col, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 72ab00f98..4d2d43a11 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -602,7 +602,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, (last_boosted_q * 0.75)); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) { - // not first frame of one pass + // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 5ba891597..fa6b362d4 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -37,8 +37,6 @@ #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_common.h" -#define INVALID_MV 0x80008000 - /* Factor to weigh the rate for switchable interp filters */ #define SWITCHABLE_INTERP_RATE_FACTOR 1 @@ -113,14 +111,6 @@ const REF_DEFINITION vp9_ref_order[MAX_REFS] = { static int rd_thresh_block_size_factor[BLOCK_SIZES] = {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32}; -#define RD_THRESH_MAX_FACT 64 -#define RD_THRESH_INC 1 -#define RD_THRESH_POW 1.25 -#define RD_MULT_EPB_RATIO 64 - -#define MV_COST_WEIGHT 108 -#define MV_COST_WEIGHT_SUB 120 - static int raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, int stride) { const int bw = b_width_log2(plane_bsize); @@ -2133,8 +2123,10 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, max_mv = MAX(max_mv, MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); // only need to check zero mv once - if (!this_mv.as_int && zero_seen) + if (!this_mv.as_int && zero_seen) { + x->mode_sad[ref_frame][i] = x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)]; continue; + } zero_seen = zero_seen || !this_mv.as_int; row_offset = this_mv.as_mv.row >> 3; @@ -2145,6 +2137,9 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride, 0x7fffffff); + x->mode_sad[ref_frame][i] = this_sad; + if (this_mv.as_int == 0) + x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] = this_sad; // Note if it is the best so far. if (this_sad < best_sad) { @@ -2153,6 +2148,12 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, } } + if (!zero_seen) + x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] = + cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, + ref_y_buffer, ref_y_stride, + 0x7fffffff); + // Note the index of the mv that worked best in the reference list. x->mv_best_ref_index[ref_frame] = best_index; x->max_mv_context[ref_frame] = max_mv; @@ -2312,7 +2313,7 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, frame_type, block_size); } -static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { +YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; int fb = get_ref_frame_idx(cpi, ref_frame); int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame); @@ -2350,7 +2351,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; - YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref); + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); int_mv pred_mv[3]; pred_mv[0] = mbmi->ref_mvs[ref][0]; @@ -2498,8 +2499,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0]; int last_besterr[2] = {INT_MAX, INT_MAX}; YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { - get_scaled_ref_frame(cpi, mbmi->ref_frame[0]), - get_scaled_ref_frame(cpi, mbmi->ref_frame[1]) + vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]), + vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1]) }; for (ref = 0; ref < 2; ++ref) { diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 4b244a50a..696cf6b11 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -19,6 +19,16 @@ (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) #define QIDX_SKIP_THRESH 115 +#define RD_THRESH_MAX_FACT 64 +#define RD_THRESH_INC 1 +#define RD_THRESH_POW 1.25 +#define RD_MULT_EPB_RATIO 64 + +#define MV_COST_WEIGHT 108 +#define MV_COST_WEIGHT_SUB 120 + +#define INVALID_MV 0x80008000 + struct TileInfo; int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex); @@ -36,6 +46,8 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int_mv frame_near_mv[MAX_REF_FRAMES], struct buf_2d yv12_mb[4][MAX_MB_PLANE]); +YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame); + void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 478b45ac0..b1c029cba 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -74,9 +74,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -ifeq ($(ARCH_X86_64),yes) -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c -endif ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 0d79346fe..9ea0f549f 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -43,6 +43,7 @@ VP9_CX_SRCS-yes += encoder/vp9_psnr.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h VP9_CX_SRCS-yes += encoder/vp9_rdopt.h +VP9_CX_SRCS-yes += encoder/vp9_pickmode.h VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.h VP9_CX_SRCS-yes += encoder/vp9_treewriter.h @@ -55,6 +56,7 @@ VP9_CX_SRCS-yes += encoder/vp9_psnr.c VP9_CX_SRCS-yes += encoder/vp9_quantize.c VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c +VP9_CX_SRCS-yes += encoder/vp9_pickmode.c VP9_CX_SRCS-yes += encoder/vp9_sad_c.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h |