diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 14 | ||||
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 42 | ||||
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c | 542 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 172 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 27 | ||||
-rw-r--r-- | vp9/encoder/vp9_mbgraph.c | 24 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 51 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 14 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 9 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_int.h | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 101 | ||||
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_vaq.c | 44 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 1 |
17 files changed, 745 insertions, 318 deletions
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 7bdd11eb0..63171033c 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -264,13 +264,13 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8 specialize vp9_convolve_avg $sse2_x86inc neon dspr2 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 sse2 ssse3 neon dspr2 +specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2 +specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert sse2 ssse3 neon dspr2 +specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg sse2 ssse3 neon dspr2 @@ -737,20 +737,20 @@ specialize vp9_fdct32x32_rd sse2 avx2 # # Motion search # -prototype int vp9_full_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n" +prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n" specialize vp9_full_search_sad sse3 sse4_1 vp9_full_search_sad_sse3=vp9_full_search_sadx3 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 -prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_refining_search_sad sse3 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 -prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_diamond_search_sad sse3 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4 -prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_full_range_search prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 60018ea86..a2cf910a4 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -139,7 +139,49 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ } \ } +#if HAVE_AVX2 +filter8_1dfunction vp9_filter_block1d16_v8_avx2; +filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif #if HAVE_SSSE3 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..0ffb1bce3 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, const unsigned char, filt1_global_avx2[32])= { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8}; + +DECLARE_ALIGNED(32, const unsigned char, filt2_global_avx2[32])= { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10}; + +DECLARE_ALIGNED(32, const unsigned char, filt3_global_avx2[32])= { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12}; + +DECLARE_ALIGNED(32, const unsigned char, filt4_global_avx2[32])= { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14}; + + +void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register +#if defined (__GNUC__) +#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ +(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) + filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); +#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) + filtersReg32 = _mm_broadcastsi128_si256(filtersReg); +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr-3))); + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register +#if defined (__GNUC__) +#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ +(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) + filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); +#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) + filtersReg32 = _mm_broadcastsi128_si256(filtersReg); +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); + + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b6, srcReg32b13)); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b6, srcReg32b13)); + + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 713cc5132..7cbdfce52 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -172,9 +172,7 @@ struct macroblock { int skip_encode; // Used to store sub partition's choices. - int fast_ms; int_mv pred_mv[MAX_REF_FRAMES]; - int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1b106f4c7..5585428d9 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -345,7 +345,6 @@ static void select_in_frame_q_segment(VP9_COMP *cpi, int mi_row, int mi_col, int output_enabled, int projected_rate) { VP9_COMMON *const cm = &cpi->common; - int target_rate = cpi->rc.sb64_target_rate << 8; // convert to bits << 8 const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; @@ -362,11 +361,10 @@ static void select_in_frame_q_segment(VP9_COMP *cpi, } else { // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). // It is converted to bits * 256 units - target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); + const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / + (bw * bh); if (projected_rate < (target_rate / 4)) { - segment = 2; - } else if (projected_rate < (target_rate / 2)) { segment = 1; } else { segment = 0; @@ -667,7 +665,18 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->oxcf.aq_mode == VARIANCE_AQ) { const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + } else { + const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; + xd->mi_8x8[0]->mbmi.segment_id = + vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } + rdmult_ratio = vp9_vaq_rdmult_ratio(energy); vp9_mb_init_quantizer(cpi, x); } @@ -681,11 +690,12 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { const int mi_offset = mi_row * cm->mi_cols + mi_col; unsigned char complexity = cpi->complexity_map[mi_offset]; - const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) || - (mi_col == 0) || (mi_col == (cm->mi_cols - 1)); + const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) || + (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2)); - if (!is_edge && (complexity > 128)) + if (!is_edge && (complexity > 128)) { x->rdmult = x->rdmult + ((x->rdmult * (complexity - 128)) / 256); + } } // Find best coding mode & reconstruct the MB so it is available @@ -709,6 +719,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, *totalrate = round(*totalrate * rdmult_ratio); } } + else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + x->rdmult = orig_rdmult; + } } static void update_stats(VP9_COMP *cpi) { @@ -1253,9 +1266,6 @@ static void rd_use_partition(VP9_COMP *cpi, x->mb_energy = vp9_block_energy(cpi, x, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - if (cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { @@ -1613,80 +1623,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, } } -static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - - if (cm->frame_type == INTER_FRAME && - !cpi->rc.is_src_frame_alt_ref && - (bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { - const PICK_MODE_CONTEXT *block_context = get_block_context(x, bsize); - const int ref0 = block_context[0].mic.mbmi.ref_frame[0]; - const int ref1 = block_context[1].mic.mbmi.ref_frame[0]; - const int ref2 = block_context[2].mic.mbmi.ref_frame[0]; - const int ref3 = block_context[3].mic.mbmi.ref_frame[0]; - - // Currently, only consider 4 inter reference frames. - if (ref0 && ref1 && ref2 && ref3) { - int d01, d23, d02, d13; - - // Motion vectors for the four subblocks. - int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; - int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; - int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; - int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; - int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; - int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; - int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; - int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; - - // Adjust sign if ref is alt_ref. - if (cm->ref_frame_sign_bias[ref0]) { - mvr0 *= -1; - mvc0 *= -1; - } - - if (cm->ref_frame_sign_bias[ref1]) { - mvr1 *= -1; - mvc1 *= -1; - } - - if (cm->ref_frame_sign_bias[ref2]) { - mvr2 *= -1; - mvc2 *= -1; - } - - if (cm->ref_frame_sign_bias[ref3]) { - mvr3 *= -1; - mvc3 *= -1; - } - - // Calculate mv distances. - d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); - d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); - d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); - d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); - - if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH && - d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) { - // Set fast motion search level. - x->fast_ms = 1; - - if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && - d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { - // Set fast motion search level. - x->fast_ms = 2; - - if (!d01 && !d23 && !d02 && !d13) { - x->fast_ms = 3; - x->subblock_ref = ref0; - } - } - } - } - } -} - static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } @@ -1726,8 +1662,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - - int partition_split_done = 0; (void) *tp_orig; if (bsize < BLOCK_8X8) { @@ -1869,18 +1803,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.less_rectangular_check) do_rect &= !partition_none_allowed; } - partition_split_done = 1; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (partition_split_done && - cpi->sf.using_small_partition_info) { - compute_fast_motion_search_level(cpi, bsize); - } - // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); @@ -1985,7 +1910,11 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - + // TODO(jbb): This code added so that we avoid static analysis + // warning related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used... + (void) best_rd; *rate = best_rate; *dist = best_dist; @@ -2009,41 +1938,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } -// Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; - int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; - int pl; - int r; - int64_t d; - - save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); - - // Default is non mask (all reference frames allowed. - cpi->ref_frame_mask = 0; - - // Do RD search for 64x64. - if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - cpi->set_ref_frame_mask = 1; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, - get_block_context(x, BLOCK_64X64), INT64_MAX); - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, BLOCK_64X64); - r += x->partition_cost[pl][PARTITION_NONE]; - - *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; - cpi->set_ref_frame_mask = 0; - } - - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); -} - static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, TOKENEXTRA **tp) { VP9_COMMON *const cm = &cpi->common; @@ -2337,6 +2231,7 @@ static void select_tx_mode(VP9_COMP *cpi) { } } } + // Start RTC Exploration typedef enum { BOTH_ZERO = 0, @@ -2368,12 +2263,15 @@ static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, mbmi->sb_type = bsize; mbmi->segment_id = 0; } + static INLINE int get_block_row(int b32i, int b16i, int b8i) { return ((b32i >> 1) << 2) + ((b16i >> 1) << 1) + (b8i >> 1); } + static INLINE int get_block_col(int b32i, int b16i, int b8i) { return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1); } + static void rtc_use_partition(VP9_COMP *cpi, const TileInfo *const tile, MODE_INFO **mi_8x8, @@ -2393,8 +2291,6 @@ static void rtc_use_partition(VP9_COMP *cpi, int row8x8_remaining = tile->mi_row_end - mi_row; int col8x8_remaining = tile->mi_col_end - mi_col; int b32i; - x->fast_ms = 0; - x->subblock_ref = 0; for (b32i = 0; b32i < 4; b32i++) { int b16i; for (b16i = 0; b16i < 4; b16i++) { @@ -2405,10 +2301,6 @@ static void rtc_use_partition(VP9_COMP *cpi, int rate; int64_t dist; - int_mv frame_nearest_mv[MAX_REF_FRAMES]; - int_mv frame_near_mv[MAX_REF_FRAMES]; - struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE]; - // Find a partition size that fits bsize = find_partition_size(cpi->sf.always_this_block_size, (row8x8_remaining - block_row), @@ -2430,10 +2322,6 @@ static void rtc_use_partition(VP9_COMP *cpi, } else { set_mode_info(&mi_8x8[index]->mbmi, bsize, mode, mi_row + block_row, mi_col + block_col); - vp9_setup_buffer_inter(cpi, x, tile, - LAST_FRAME, cpi->sf.always_this_block_size, - mi_row + block_row, mi_col + block_col, - frame_nearest_mv, frame_near_mv, yv12_mb); } for (j = 0; j < mi_height; j++) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3e04c2faf..bf9dd3ec5 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1566,13 +1566,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { (i >= MIN_GF_INTERVAL) && // for real scene cuts (not forced kfs) dont allow arf very near kf. (rc->next_key_frame_forced || - (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) && - ((next_frame.pcnt_inter > 0.75) || - (next_frame.pcnt_second_ref > 0.5)) && - ((mv_in_out_accumulator / (double)i > -0.2) || - (mv_in_out_accumulator > -2.0)) && - (boost_score > 100)) { - + (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) { // Alternative boost calculation for alt ref rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); @@ -1926,8 +1920,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double kf_mod_err = 0.0; double kf_group_err = 0.0; - double kf_group_intra_err = 0.0; - double kf_group_coded_err = 0.0; double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; RATE_CONTROL *const rc = &cpi->rc; @@ -1965,12 +1957,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Accumulate kf group error kf_group_err += calculate_modified_err(cpi, this_frame); - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; - // load a the next frame's stats last_frame = *this_frame; input_stats(twopass, this_frame); @@ -2030,15 +2016,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { reset_fpf_position(twopass, start_position); kf_group_err = 0; - kf_group_intra_err = 0; - kf_group_coded_err = 0; // Rescan to get the correct error data for the forced kf group for (i = 0; i < rc->frames_to_key; i++) { // Accumulate kf group errors kf_group_err += calculate_modified_err(cpi, &tmp_frame); - kf_group_intra_err += tmp_frame.intra_error; - kf_group_coded_err += tmp_frame.coded_error; // Load the next frame's stats. input_stats(twopass, &tmp_frame); @@ -2054,12 +2036,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error kf_group_err += calculate_modified_err(cpi, this_frame); - - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; } // Calculate the number of bits that should be assigned to the kf group. @@ -2089,7 +2065,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // frames use inter blocks. decay_accumulator = 1.0; boost_score = 0.0; - loop_decay_rate = 1.00; // Starting decay rate // Scan through the kf group collating various stats. for (i = 0; i < rc->frames_to_key; i++) { diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index c50098678..7eacda217 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -29,7 +29,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; - unsigned int best_err; const int tmp_col_min = x->mv_col_min; const int tmp_col_max = x->mv_col_max; @@ -48,27 +47,22 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.row = ref_mv->row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, - 0, &v_fn_ptr, 0, ref_mv, dst_mv); + vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0, + ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; unsigned int sse; - best_err = cpi->find_fractional_mv_step( - x, dst_mv, ref_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, - 0, cpi->sf.subpel_iters_per_step, NULL, NULL, - & distortion, &sse); + cpi->find_fractional_mv_step( + x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion, + &sse); } vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv); vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - INT_MAX); /* restore UMV window */ x->mv_col_min = tmp_col_min; @@ -76,7 +70,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x->mv_row_min = tmp_row_min; x->mv_row_max = tmp_row_max; - return best_err; + return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride, + INT_MAX); } static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv, @@ -355,7 +351,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { // If any of the blocks in the sequence failed then the MB // goes in segment 0 - if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) { + if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { ncnt[0]++; cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; } else { diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index b2555e682..62b33e4b9 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -349,6 +349,10 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tr = br; tc = bc; } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; bestmv->row = br; bestmv->col = bc; @@ -452,6 +456,11 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, tr = br; tc = bc; } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; + bestmv->row = br; bestmv->col = bc; @@ -850,8 +859,9 @@ int vp9_square_search(const MACROBLOCK *x, int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; @@ -965,8 +975,9 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int vp9_diamond_search_sad_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv) { int i, j, step; const MACROBLOCKD *const xd = &x->e_mbd; @@ -1099,7 +1110,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, int vp9_diamond_search_sadx4(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { int i, j, step; @@ -1279,7 +1290,8 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, - int do_refine, vp9_variance_fn_ptr_t *fn_ptr, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, int_mv *dst_mv) { int_mv temp_mv; int thissme, n, num00; @@ -1336,9 +1348,9 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, int block) { int r, c; @@ -1389,10 +1401,11 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, MV *ref_mv, } } -int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv, int n) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, int n) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; @@ -1494,9 +1507,9 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv, return INT_MAX; } -int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, int n) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1630,7 +1643,8 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv, int vp9_refining_search_sad_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1702,7 +1716,8 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, int vp9_refining_search_sadx4(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1815,8 +1830,10 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, // mode. int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, const uint8_t *second_pred, int w, int h) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 28b46b503..e1d6abeb6 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -45,7 +45,7 @@ int vp9_init_search_range(struct VP9_COMP *cpi, int size); int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, int_mv *dst_mv); int vp9_hex_search(const MACROBLOCK *x, @@ -107,15 +107,16 @@ typedef int (fractional_mv_step_comp_fp) ( extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, - MV *ref_mv, int sad_per_bit, - int distance, vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, int sad_per_bit, + int distance, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, int n); typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv); @@ -123,13 +124,14 @@ typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, const uint8_t *second_pred, int w, int h); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 9f4d47bdd..cef7e0403 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -96,7 +96,7 @@ FILE *keyfile; void vp9_init_quantizer(VP9_COMP *cpi); static const double in_frame_q_adj_ratio[MAX_SEGMENTS] = - {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { @@ -267,7 +267,6 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { // Clear down the complexity map used for rd vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols); - // Enable segmentation vp9_enable_segmentation((VP9_PTR)cpi); vp9_clearall_segfeatures(seg); @@ -278,7 +277,7 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); // Use some of the segments for in frame Q adjustment - for (segment = 1; segment < 3; segment++) { + for (segment = 1; segment < 2; segment++) { qindex_delta = vp9_compute_qdelta_by_rate(cpi, cm->base_qindex, in_frame_q_adj_ratio[segment]); @@ -911,7 +910,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 0; sf->use_fast_lpf_pick = 0; sf->use_fast_coef_updates = 0; - sf->using_small_partition_info = 0; sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set sf->use_pick_mode = 0; sf->encode_breakout_thresh = 0; @@ -3708,8 +3706,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, xd->interp_kernel = vp9_get_interp_kernel( DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_vaq_init(); + } if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 88a041984..1ab1814c0 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -346,11 +346,6 @@ typedef struct { // inter modes or to enable it always. int disable_split_mask; - // TODO(jbb): Remove this and everything that uses it. It's only valid if - // we were doing small to large partition checks. We currently do the - // reverse. - int using_small_partition_info; - // TODO(jingning): combine the related motion search speed features // This allows us to use motion search at other sizes as a starting // point for this motion search and limits the search range around it. diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 3b4a7f6e4..912968594 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -98,10 +98,8 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv.as_mv, tmp_mv); + vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1, + &cpi->fn_ptr[bsize], &ref_mv.as_mv, tmp_mv); x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 9afa0647d..2427dbe74 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -846,7 +846,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int q; if (frame_is_intra_only(cm)) { - active_best_quality = rc->best_quality; #if !CONFIG_MULTIPLE_ARF // Handle the special case for key frames forced when we have75 reached // the maximum key frame interval. Here force the Q to a range diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index cae7884fd..03def2095 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -68,7 +68,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int skip; - const int16_t *scan, *nb; + const scan_order *so; }; const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { @@ -635,7 +635,7 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, args->t_left + y_idx, tx_size, - args->scan, args->nb); + args->so->scan, args->so->neighbors); } static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -710,49 +710,40 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size, } } -static void init_rdcost_stack(MACROBLOCK *x, const int64_t ref_rdcost, - struct rdcost_block_args *arg) { - vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); - arg->x = x; - arg->best_rd = ref_rdcost; -} - static void txfm_rd_in_plane(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size) { - struct rdcost_block_args rd_stack; MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; - const scan_order *so; + struct rdcost_block_args args = { 0 }; + args.x = x; + args.best_rd = ref_best_rd; - init_rdcost_stack(x, ref_best_rd, &rd_stack); if (plane == 0) xd->mi_8x8[0]->mbmi.tx_size = tx_size; - vp9_get_entropy_contexts(tx_size, rd_stack.t_above, rd_stack.t_left, + vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left, pd->above_context, pd->left_context, num_4x4_w, num_4x4_h); - so = get_scan(xd, tx_size, pd->plane_type, 0); - rd_stack.scan = so->scan; - rd_stack.nb = so->neighbors; + args.so = get_scan(xd, tx_size, pd->plane_type, 0); vp9_foreach_transformed_block_in_plane(xd, bsize, plane, - block_rd_txfm, &rd_stack); - if (rd_stack.skip) { + block_rd_txfm, &args); + if (args.skip) { *rate = INT_MAX; *distortion = INT64_MAX; *sse = INT64_MAX; *skippable = 0; } else { - *distortion = rd_stack.this_dist; - *rate = rd_stack.this_rate; - *sse = rd_stack.this_sse; + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; *skippable = vp9_is_skippable_in_plane(x, bsize, plane); } } @@ -787,7 +778,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2]; + int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -862,7 +856,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2]; + int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; @@ -1604,13 +1601,11 @@ typedef struct { int mvthresh; } BEST_SEG_INFO; -static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { - int r = 0; - r |= (mv->as_mv.row >> 3) < x->mv_row_min; - r |= (mv->as_mv.row >> 3) > x->mv_row_max; - r |= (mv->as_mv.col >> 3) < x->mv_col_min; - r |= (mv->as_mv.col >> 3) > x->mv_col_max; - return r; +static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) { + return (mv->row >> 3) < x->mv_row_min || + (mv->row >> 3) > x->mv_row_max || + (mv->col >> 3) < x->mv_col_min || + (mv->col >> 3) > x->mv_col_max; } static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { @@ -1924,10 +1919,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } // Trap vectors that reach beyond the UMV borders - if (mv_check_bounds(x, &mode_mv[this_mode])) - continue; - if (has_second_rf && - mv_check_bounds(x, &second_mode_mv[this_mode])) + if (mv_check_bounds(x, &mode_mv[this_mode].as_mv) || + (has_second_rf && + mv_check_bounds(x, &second_mode_mv[this_mode].as_mv))) continue; if (filter_idx > 0) { @@ -2380,24 +2374,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, vp9_set_mv_search_range(x, &ref_mv.as_mv); - // Adjust search parameters based on small partitions' result. - if (x->fast_ms) { - // adjust search range - step_param = 6; - if (x->fast_ms > 1) - step_param = 8; + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + + cpi->mv_step_param) >> 1; } else { - // Work out the size of the first step in the mv step search. - // 0 here is maximum length first step. 1 is MAX >> 1 etc. - if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { - // Take wtd average of the step_params based on the last frame's - // max mv magnitude and that based on the best ref mvs of the current - // block for the given reference. - step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + - cpi->mv_step_param) >> 1; - } else { - step_param = cpi->mv_step_param; - } + step_param = cpi->mv_step_param; } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && @@ -2750,7 +2736,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(x, &cur_mv[i])) + if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX; mbmi->mv[i].as_int = cur_mv[i].as_int; } @@ -3260,12 +3246,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; - // Skip some checking based on small partitions' result. - if (x->fast_ms > 1 && !ref_frame) - continue; - if (x->fast_ms > 2 && ref_frame != x->subblock_ref) - continue; - mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; @@ -4126,11 +4106,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (tmp_rd == INT64_MAX) continue; } else { - if (cm->interp_filter == SWITCHABLE) { - int rs = get_switchable_rate(x); - tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); - } - tmp_rd = tmp_best_rdu; total_sse = tmp_best_sse; rate = tmp_best_rate; rate_y = tmp_best_ratey; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index ca11dda1e..004047773 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -29,7 +29,6 @@ #include "vpx_scale/vpx_scale.h" #define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering -#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, @@ -160,11 +159,9 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // Ignore mv costing by sending NULL pointer instead of cost arrays - bestsme = vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, - &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1, ref_mv); + vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, + &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv); -#if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? // if (bestsme > error_thresh && bestsme < INT_MAX) { @@ -180,7 +177,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, NULL, NULL, &distortion, &sse); } -#endif // Restore input state x->plane[0].src = src; diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c index 1f9cb8709..acd7c416e 100644 --- a/vp9/encoder/vp9_vaq.c +++ b/vp9/encoder/vp9_vaq.c @@ -19,8 +19,8 @@ #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_systemdependent.h" -#define ENERGY_MIN (-3) -#define ENERGY_MAX (3) +#define ENERGY_MIN (-1) +#define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy)\ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) @@ -65,7 +65,7 @@ void vp9_vaq_init() { vp9_clear_system_state(); // __asm emms; - base_ratio = 1.8; + base_ratio = 1.5; for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { Q_RATIO(i) = pow(base_ratio, i/3.0); @@ -80,30 +80,34 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) { cm->y_dc_delta_q); int i; - vp9_enable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(seg); + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + vp9_enable_segmentation((VP9_PTR)cpi); + vp9_clearall_segfeatures(seg); - seg->abs_delta = SEGMENT_DELTADATA; + seg->abs_delta = SEGMENT_DELTADATA; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // __asm emms; - for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { - int qindex_delta, segment_rdmult; + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + int qindex_delta, segment_rdmult; - if (Q_RATIO(i) == 1) { - // No need to enable SEG_LVL_ALT_Q for this segment - RDMULT_RATIO(i) = 1; - continue; - } + if (Q_RATIO(i) == 1) { + // No need to enable SEG_LVL_ALT_Q for this segment + RDMULT_RATIO(i) = 1; + continue; + } - qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); - vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); - vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); + qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); + vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); - segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + - cm->y_dc_delta_q); + segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + + cm->y_dc_delta_q); - RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + } } } diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 85e83b834..48d6a7ca0 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -79,6 +79,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm |