Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh                   |  14
-rw-r--r--  vp9/common/x86/vp9_asm_stubs.c                |  42
-rw-r--r--  vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c  | 542
-rw-r--r--  vp9/encoder/vp9_block.h                       |   2
-rw-r--r--  vp9/encoder/vp9_encodeframe.c                 | 172
-rw-r--r--  vp9/encoder/vp9_firstpass.c                   |  27
-rw-r--r--  vp9/encoder/vp9_mbgraph.c                     |  24
-rw-r--r--  vp9/encoder/vp9_mcomp.c                       |  51
-rw-r--r--  vp9/encoder/vp9_mcomp.h                       |  14
-rw-r--r--  vp9/encoder/vp9_onyx_if.c                     |   9
-rw-r--r--  vp9/encoder/vp9_onyx_int.h                    |   5
-rw-r--r--  vp9/encoder/vp9_pickmode.c                    |   6
-rw-r--r--  vp9/encoder/vp9_ratectrl.c                    |   1
-rw-r--r--  vp9/encoder/vp9_rdopt.c                       | 101
-rw-r--r--  vp9/encoder/vp9_temporal_filter.c             |   8
-rw-r--r--  vp9/encoder/vp9_vaq.c                         |  44
-rw-r--r--  vp9/vp9_common.mk                             |   1
17 files changed, 745 insertions(+), 318 deletions(-)
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 7bdd11eb0..63171033c 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -264,13 +264,13 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8
specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 sse2 ssse3 neon dspr2
+specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
+specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
+specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
@@ -737,20 +737,20 @@ specialize vp9_fdct32x32_rd sse2 avx2
#
# Motion search
#
-prototype int vp9_full_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n"
+prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n"
specialize vp9_full_search_sad sse3 sse4_1
vp9_full_search_sad_sse3=vp9_full_search_sadx3
vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
-prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
+prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
specialize vp9_refining_search_sad sse3
vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
-prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
+prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
specialize vp9_diamond_search_sad sse3
vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
-prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
+prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
specialize vp9_full_range_search
prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
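Each specialize line above names the optimized variants that exist for a prototype; adding avx2 tells the rtcd generator to emit an AVX2 entry in the run-time dispatch it builds from this script. A minimal sketch of that dispatch idea (illustrative only -- the real table is generated at build time; the _c/_ssse3/_avx2 symbols are the variants the specialize line refers to, and the sse2/neon/dspr2 entries are elided here):

#include <stddef.h>
#include <stdint.h>

void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4, int w, int h);
void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4, int w, int h);
void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4, int w, int h);

/* function pointer the generated setup code would assign once at init time */
static void (*vp9_convolve8_ptr)(const uint8_t *, ptrdiff_t, uint8_t *,
                                 ptrdiff_t, const int16_t *, int,
                                 const int16_t *, int, int, int);

static void setup_convolve8_sketch(int have_ssse3, int have_avx2) {
  vp9_convolve8_ptr = vp9_convolve8_c;                     /* portable fallback */
  if (have_ssse3) vp9_convolve8_ptr = vp9_convolve8_ssse3;
  if (have_avx2)  vp9_convolve8_ptr = vp9_convolve8_avx2;  /* new in this change */
}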
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 60018ea86..a2cf910a4 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -139,7 +139,49 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
+#if HAVE_AVX2
+filter8_1dfunction vp9_filter_block1d16_v8_avx2;
+filter8_1dfunction vp9_filter_block1d16_h8_avx2;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2);
+#endif
#if HAVE_SSSE3
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
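Only the 16-pixel-wide horizontal and vertical kernels get genuinely new AVX2 code here; the 8- and 4-wide and the bilinear (2-tap) cases are aliased to the existing SSSE3 kernels via the #defines above, and FUN_CONV_1D/FUN_CONV_2D then stamp out the vp9_convolve8_*_avx2 wrappers whose prototypes appear in the comments. A rough sketch of the wrapper pattern those macros produce, assuming the usual libvpx arrangement of dispatching on block width (not the actual macro expansion, which also covers the 2-tap filters and the averaging variants):

/* Sketch only; the kernel names are the ones declared or aliased above. */
static void convolve8_horiz_avx2_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                        uint8_t *dst, ptrdiff_t dst_stride,
                                        const int16_t *filter_x, int x_step_q4,
                                        const int16_t *filter_y, int y_step_q4,
                                        int w, int h) {
  (void) filter_y;
  (void) y_step_q4;  /* horizontal pass only */
  if (x_step_q4 != 16) {
    /* fractional stepping stays on the generic C path */
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }
  while (w >= 16) {   /* the true AVX2 path added by this change */
    vp9_filter_block1d16_h8_avx2((uint8_t *)src, src_stride,
                                 dst, dst_stride, h, (int16_t *)filter_x);
    src += 16; dst += 16; w -= 16;
  }
  if (w >= 8) {       /* resolves to the SSSE3 kernel through the #define */
    vp9_filter_block1d8_h8_avx2((uint8_t *)src, src_stride,
                                dst, dst_stride, h, (int16_t *)filter_x);
    src += 8; dst += 8; w -= 8;
  }
  if (w >= 4)
    vp9_filter_block1d4_h8_avx2((uint8_t *)src, src_stride,
                                dst, dst_stride, h, (int16_t *)filter_x);
}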
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
new file mode 100644
index 000000000..0ffb1bce3
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include "vpx_ports/mem.h"
+
+// filters for 16_h8 and 16_v8
+DECLARE_ALIGNED(32, const unsigned char, filt1_global_avx2[32])= {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
+
+DECLARE_ALIGNED(32, const unsigned char, filt2_global_avx2[32])= {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
+
+DECLARE_ALIGNED(32, const unsigned char, filt3_global_avx2[32])= {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
+
+DECLARE_ALIGNED(32, const unsigned char, filt4_global_avx2[32])= {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
+
+
+void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ unsigned int src_stride, dst_stride;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+#if defined (__GNUC__)
+#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
+(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
+ filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
+#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
+ filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
+#else
+ filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+#else
+ filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i-=2) {
+ // load the 2 strides of source
+ srcReg32b1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr-3)));
+ srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+ _mm_loadu_si128((__m128i *)
+ (src_ptr+src_pixels_per_line-3)), 1);
+
+ // filter the source buffer
+ srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+ _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+5)));
+ srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+ _mm_loadu_si128((__m128i *)
+ (src_ptr+src_pixels_per_line+5)), 1);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+ _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // filter the source buffer
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+ _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+ _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
+ srcRegFilt32b2_1);
+
+ src_ptr+=src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt32b1_1));
+
+ // save the next 16 bits
+ _mm_store_si128((__m128i*)(output_ptr+output_pitch),
+ _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr+=dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 16 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(secondFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt4Reg));
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+ _mm256_castsi256_si128(forthFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+ // reading the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ // filter the source buffer
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(secondFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt4Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+ _mm256_castsi256_si128(forthFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(addFilterReg64));
+
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm256_castsi256_si128(addFilterReg64));
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+ }
+}
+
+void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ unsigned int src_stride, dst_stride;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+#if defined (__GNUC__)
+#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
+(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
+ filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
+#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
+ filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
+#else
+ filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+#else
+ filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x706u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr)));
+ srcReg32b2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
+ srcReg32b3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
+ srcReg32b4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
+ srcReg32b5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
+ srcReg32b6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
+
+ // have each consecutive loads on the same 256 register
+ srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+ _mm256_castsi256_si128(srcReg32b2), 1);
+ srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+ _mm256_castsi256_si128(srcReg32b3), 1);
+ srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+ _mm256_castsi256_si128(srcReg32b4), 1);
+ srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+ _mm256_castsi256_si128(srcReg32b5), 1);
+ srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+ _mm256_castsi256_si128(srcReg32b6), 1);
+ srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+ _mm256_castsi256_si128(srcReg32b7), 1);
+
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // save
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+ // save
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+ // save
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ // save
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+
+ for (i = output_height; i > 1; i-=2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
+
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+ srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_min_epi16(srcReg32b6, srcReg32b13));
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_max_epi16(srcReg32b6, srcReg32b13));
+
+
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr+=src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr,
+ _mm256_castsi256_si128(srcReg32b1));
+
+ // save the next 16 bits
+ _mm_store_si128((__m128i*)(output_ptr+out_pitch),
+ _mm256_extractf128_si256(srcReg32b1, 1));
+
+ output_ptr+=dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+ // merge the last 2 results together
+ srcRegFilt4 = _mm_unpacklo_epi8(
+ _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 = _mm_unpackhi_epi8(
+ _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
+ _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
+ _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+ _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+ _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+ _mm256_castsi256_si128(addFilterReg64));
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+ }
+}
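For reference, each output pixel of the 8-tap kernels above reduces to the following scalar computation (a sketch, not part of the patch): eight taps in Q7 precision, rounded by adding 64, shifted right by 7 and clamped to 8 bits -- the same rounding the intrinsics perform with addFilterReg64, _mm256_srai_epi16(..., 7) and _mm256_packus_epi16. The min/max ordering of the saturating adds in the SIMD code exists only to keep intermediate sums within 16 bits; a 32-bit scalar accumulator does not need it.

/* Scalar sketch of one horizontal output pixel; src points at the pixel
   being filtered and filter holds the 8 taps (which sum to 128). */
static unsigned char filter8_ref(const unsigned char *src,
                                 const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k)
    sum += src[k - 3] * filter[k];   /* taps cover src[-3] .. src[4] */
  sum = (sum + 64) >> 7;             /* round and drop the Q7 scaling */
  if (sum < 0) sum = 0;              /* clamp to the 8-bit pixel range */
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}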
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 713cc5132..7cbdfce52 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -172,9 +172,7 @@ struct macroblock {
int skip_encode;
// Used to store sub partition's choices.
- int fast_ms;
int_mv pred_mv[MAX_REF_FRAMES];
- int subblock_ref;
// TODO(jingning): Need to refactor the structure arrays that buffers the
// coding mode decisions of each partition type.
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1b106f4c7..5585428d9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -345,7 +345,6 @@ static void select_in_frame_q_segment(VP9_COMP *cpi,
int mi_row, int mi_col,
int output_enabled, int projected_rate) {
VP9_COMMON *const cm = &cpi->common;
- int target_rate = cpi->rc.sb64_target_rate << 8; // convert to bits << 8
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
@@ -362,11 +361,10 @@ static void select_in_frame_q_segment(VP9_COMP *cpi,
} else {
// Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
// It is converted to bits * 256 units
- target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+ const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
+ (bw * bh);
if (projected_rate < (target_rate / 4)) {
- segment = 2;
- } else if (projected_rate < (target_rate / 2)) {
segment = 1;
} else {
segment = 0;
@@ -667,7 +665,18 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
: vp9_block_energy(cpi, x, bsize);
- xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
+
+ if (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
+ } else {
+ const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ xd->mi_8x8[0]->mbmi.segment_id =
+ vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+
rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
vp9_mb_init_quantizer(cpi, x);
}
@@ -681,11 +690,12 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
} else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
unsigned char complexity = cpi->complexity_map[mi_offset];
- const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) ||
- (mi_col == 0) || (mi_col == (cm->mi_cols - 1));
+ const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
+ (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
- if (!is_edge && (complexity > 128))
+ if (!is_edge && (complexity > 128)) {
x->rdmult = x->rdmult + ((x->rdmult * (complexity - 128)) / 256);
+ }
}
// Find best coding mode & reconstruct the MB so it is available
@@ -709,6 +719,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
*totalrate = round(*totalrate * rdmult_ratio);
}
}
+ else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = orig_rdmult;
+ }
}
static void update_stats(VP9_COMP *cpi) {
@@ -1253,9 +1266,6 @@ static void rd_use_partition(VP9_COMP *cpi,
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
- x->fast_ms = 0;
- x->subblock_ref = 0;
-
if (cpi->sf.adjust_partitioning_from_last_frame) {
// Check if any of the sub blocks are further split.
if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
@@ -1613,80 +1623,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
}
}
-static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
-
- if (cm->frame_type == INTER_FRAME &&
- !cpi->rc.is_src_frame_alt_ref &&
- (bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) {
- const PICK_MODE_CONTEXT *block_context = get_block_context(x, bsize);
- const int ref0 = block_context[0].mic.mbmi.ref_frame[0];
- const int ref1 = block_context[1].mic.mbmi.ref_frame[0];
- const int ref2 = block_context[2].mic.mbmi.ref_frame[0];
- const int ref3 = block_context[3].mic.mbmi.ref_frame[0];
-
- // Currently, only consider 4 inter reference frames.
- if (ref0 && ref1 && ref2 && ref3) {
- int d01, d23, d02, d13;
-
- // Motion vectors for the four subblocks.
- int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row;
- int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col;
- int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row;
- int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col;
- int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row;
- int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col;
- int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row;
- int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col;
-
- // Adjust sign if ref is alt_ref.
- if (cm->ref_frame_sign_bias[ref0]) {
- mvr0 *= -1;
- mvc0 *= -1;
- }
-
- if (cm->ref_frame_sign_bias[ref1]) {
- mvr1 *= -1;
- mvc1 *= -1;
- }
-
- if (cm->ref_frame_sign_bias[ref2]) {
- mvr2 *= -1;
- mvc2 *= -1;
- }
-
- if (cm->ref_frame_sign_bias[ref3]) {
- mvr3 *= -1;
- mvc3 *= -1;
- }
-
- // Calculate mv distances.
- d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1));
- d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3));
- d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2));
- d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3));
-
- if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH &&
- d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) {
- // Set fast motion search level.
- x->fast_ms = 1;
-
- if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 &&
- d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
- // Set fast motion search level.
- x->fast_ms = 2;
-
- if (!d01 && !d23 && !d02 && !d13) {
- x->fast_ms = 3;
- x->subblock_ref = ref0;
- }
- }
- }
- }
- }
-}
-
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
@@ -1726,8 +1662,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
bsize >= BLOCK_8X8;
int partition_vert_allowed = !force_horz_split && xss <= yss &&
bsize >= BLOCK_8X8;
-
- int partition_split_done = 0;
(void) *tp_orig;
if (bsize < BLOCK_8X8) {
@@ -1869,18 +1803,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.less_rectangular_check)
do_rect &= !partition_none_allowed;
}
- partition_split_done = 1;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- x->fast_ms = 0;
- x->subblock_ref = 0;
-
- if (partition_split_done &&
- cpi->sf.using_small_partition_info) {
- compute_fast_motion_search_level(cpi, bsize);
- }
-
// PARTITION_HORZ
if (partition_horz_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_HORZ);
@@ -1985,7 +1910,11 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
-
+ // TODO(jbb): This code added so that we avoid static analysis
+ // warning related to the fact that best_rd isn't used after this
+ // point. This code should be refactored so that the duplicate
+ // checks occur in some sub function and thus are used...
+ (void) best_rd;
*rate = best_rate;
*dist = best_dist;
@@ -2009,41 +1938,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
}
-// Examines 64x64 block and chooses a best reference frame
-static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl;
- int ms = bs / 2;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl[8], sa[8];
- int pl;
- int r;
- int64_t d;
-
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
-
- // Default is non mask (all reference frames allowed.
- cpi->ref_frame_mask = 0;
-
- // Do RD search for 64x64.
- if ((mi_row + (ms >> 1) < cm->mi_rows) &&
- (mi_col + (ms >> 1) < cm->mi_cols)) {
- cpi->set_ref_frame_mask = 1;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
- get_block_context(x, BLOCK_64X64), INT64_MAX);
- pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
- mi_row, mi_col, BLOCK_64X64);
- r += x->partition_cost[pl][PARTITION_NONE];
-
- *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
- cpi->set_ref_frame_mask = 0;
- }
-
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
-}
-
static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
int mi_row, TOKENEXTRA **tp) {
VP9_COMMON *const cm = &cpi->common;
@@ -2337,6 +2231,7 @@ static void select_tx_mode(VP9_COMP *cpi) {
}
}
}
+
// Start RTC Exploration
typedef enum {
BOTH_ZERO = 0,
@@ -2368,12 +2263,15 @@ static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
mbmi->sb_type = bsize;
mbmi->segment_id = 0;
}
+
static INLINE int get_block_row(int b32i, int b16i, int b8i) {
return ((b32i >> 1) << 2) + ((b16i >> 1) << 1) + (b8i >> 1);
}
+
static INLINE int get_block_col(int b32i, int b16i, int b8i) {
return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1);
}
+
static void rtc_use_partition(VP9_COMP *cpi,
const TileInfo *const tile,
MODE_INFO **mi_8x8,
@@ -2393,8 +2291,6 @@ static void rtc_use_partition(VP9_COMP *cpi,
int row8x8_remaining = tile->mi_row_end - mi_row;
int col8x8_remaining = tile->mi_col_end - mi_col;
int b32i;
- x->fast_ms = 0;
- x->subblock_ref = 0;
for (b32i = 0; b32i < 4; b32i++) {
int b16i;
for (b16i = 0; b16i < 4; b16i++) {
@@ -2405,10 +2301,6 @@ static void rtc_use_partition(VP9_COMP *cpi,
int rate;
int64_t dist;
- int_mv frame_nearest_mv[MAX_REF_FRAMES];
- int_mv frame_near_mv[MAX_REF_FRAMES];
- struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
-
// Find a partition size that fits
bsize = find_partition_size(cpi->sf.always_this_block_size,
(row8x8_remaining - block_row),
@@ -2430,10 +2322,6 @@ static void rtc_use_partition(VP9_COMP *cpi,
} else {
set_mode_info(&mi_8x8[index]->mbmi, bsize, mode,
mi_row + block_row, mi_col + block_col);
- vp9_setup_buffer_inter(cpi, x, tile,
- LAST_FRAME, cpi->sf.always_this_block_size,
- mi_row + block_row, mi_col + block_col,
- frame_nearest_mv, frame_near_mv, yv12_mb);
}
for (j = 0; j < mi_height; j++)
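Earlier in this file's changes, the COMPLEXITY_AQ branch scales rdmult in proportion to how far the 8-bit complexity value sits above its neutral point of 128, and the original rdmult is now restored afterwards for that mode as well. A small worked sketch of that scaling, with illustrative numbers:

/* Sketch of the COMPLEXITY_AQ adjustment applied in rd_pick_sb_modes above. */
static int scale_rdmult(int rdmult, unsigned char complexity) {
  if (complexity > 128)
    rdmult += (rdmult * (complexity - 128)) / 256;
  return rdmult;   /* e.g. rdmult = 1000, complexity = 192 -> 1250 */
}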
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3e04c2faf..bf9dd3ec5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1566,13 +1566,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(i >= MIN_GF_INTERVAL) &&
// for real scene cuts (not forced kfs) dont allow arf very near kf.
(rc->next_key_frame_forced ||
- (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) &&
- ((next_frame.pcnt_inter > 0.75) ||
- (next_frame.pcnt_second_ref > 0.5)) &&
- ((mv_in_out_accumulator / (double)i > -0.2) ||
- (mv_in_out_accumulator > -2.0)) &&
- (boost_score > 100)) {
-
+ (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) {
// Alternative boost calculation for alt ref
rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
&b_boost);
@@ -1926,8 +1920,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double kf_mod_err = 0.0;
double kf_group_err = 0.0;
- double kf_group_intra_err = 0.0;
- double kf_group_coded_err = 0.0;
double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
RATE_CONTROL *const rc = &cpi->rc;
@@ -1965,12 +1957,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including
- // key frames in the group. The effect of the key frame itself can be
- // subtracted out using the first_frame data collected above.
- kf_group_intra_err += this_frame->intra_error;
- kf_group_coded_err += this_frame->coded_error;
-
// load a the next frame's stats
last_frame = *this_frame;
input_stats(twopass, this_frame);
@@ -2030,15 +2016,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
reset_fpf_position(twopass, start_position);
kf_group_err = 0;
- kf_group_intra_err = 0;
- kf_group_coded_err = 0;
// Rescan to get the correct error data for the forced kf group
for (i = 0; i < rc->frames_to_key; i++) {
// Accumulate kf group errors
kf_group_err += calculate_modified_err(cpi, &tmp_frame);
- kf_group_intra_err += tmp_frame.intra_error;
- kf_group_coded_err += tmp_frame.coded_error;
// Load the next frame's stats.
input_stats(twopass, &tmp_frame);
@@ -2054,12 +2036,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
-
- // These figures keep intra and coded error counts for all frames including
- // key frames in the group. The effect of the key frame itself can be
- // subtracted out using the first_frame data collected above.
- kf_group_intra_err += this_frame->intra_error;
- kf_group_coded_err += this_frame->coded_error;
}
// Calculate the number of bits that should be assigned to the kf group.
@@ -2089,7 +2065,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// frames use inter blocks.
decay_accumulator = 1.0;
boost_score = 0.0;
- loop_decay_rate = 1.00; // Starting decay rate
// Scan through the kf group collating various stats.
for (i = 0; i < rc->frames_to_key; i++) {
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index c50098678..7eacda217 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -29,7 +29,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
- unsigned int best_err;
const int tmp_col_min = x->mv_col_min;
const int tmp_col_max = x->mv_col_max;
@@ -48,27 +47,22 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
ref_full.row = ref_mv->row >> 3;
/*cpi->sf.search_method == HEX*/
- best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
- 0, &v_fn_ptr, 0, ref_mv, dst_mv);
+ vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
+ ref_mv, dst_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
{
int distortion;
unsigned int sse;
- best_err = cpi->find_fractional_mv_step(
- x, dst_mv, ref_mv,
- cpi->common.allow_high_precision_mv,
- x->errorperbit, &v_fn_ptr,
- 0, cpi->sf.subpel_iters_per_step, NULL, NULL,
- & distortion, &sse);
+ cpi->find_fractional_mv_step(
+ x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion,
+ &sse);
}
vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv);
vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
- best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].dst.buf, xd->plane[0].dst.stride,
- INT_MAX);
/* restore UMV window */
x->mv_col_min = tmp_col_min;
@@ -76,7 +70,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
- return best_err;
+ return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+ INT_MAX);
}
static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv,
@@ -355,7 +351,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
// If any of the blocks in the sequence failed then the MB
// goes in segment 0
- if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) {
+ if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
ncnt[0]++;
cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
} else {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index b2555e682..62b33e4b9 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -349,6 +349,10 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
tr = br;
tc = bc;
}
+ // These lines insure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void) tr;
+ (void) tc;
bestmv->row = br;
bestmv->col = bc;
@@ -452,6 +456,11 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x,
tr = br;
tc = bc;
}
+ // These lines insure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void) tr;
+ (void) tc;
+
bestmv->row = br;
bestmv->col = bc;
@@ -850,8 +859,9 @@ int vp9_square_search(const MACROBLOCK *x,
int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
- vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
- int *mvcost[2], const MV *center_mv) {
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
const uint8_t *what = x->plane[0].src.buf;
const int what_stride = x->plane[0].src.stride;
@@ -965,8 +975,9 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
int vp9_diamond_search_sad_c(const MACROBLOCK *x,
MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
- vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
- int *mvcost[2], const MV *center_mv) {
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv) {
int i, j, step;
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -1099,7 +1110,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
int vp9_diamond_search_sadx4(const MACROBLOCK *x,
MV *ref_mv, MV *best_mv, int search_param,
int sad_per_bit, int *num00,
- vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv) {
int i, j, step;
@@ -1279,7 +1290,8 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x,
int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param,
int sadpb, int further_steps,
- int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
+ int do_refine,
+ const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, int_mv *dst_mv) {
int_mv temp_mv;
int thissme, n, num00;
@@ -1336,9 +1348,9 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
return bestsme;
}
-int vp9_full_search_sad_c(const MACROBLOCK *x, MV *ref_mv,
+int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
int sad_per_bit, int distance,
- vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv, int block) {
int r, c;
@@ -1389,10 +1401,11 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, MV *ref_mv,
}
}
-int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv,
+int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
int sad_per_bit, int distance,
- vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
- int *mvcost[2], const MV *center_mv, int n) {
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv, int n) {
const MACROBLOCKD *const xd = &x->e_mbd;
const uint8_t *const what = x->plane[0].src.buf;
const int what_stride = x->plane[0].src.stride;
@@ -1494,9 +1507,9 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv,
return INT_MAX;
}
-int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv,
+int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
int sad_per_bit, int distance,
- vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv, int n) {
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -1630,7 +1643,8 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv,
int vp9_refining_search_sad_c(const MACROBLOCK *x,
MV *ref_mv, int error_per_bit,
- int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -1702,7 +1716,8 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
int vp9_refining_search_sadx4(const MACROBLOCK *x,
MV *ref_mv, int error_per_bit,
- int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -1815,8 +1830,10 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x,
// mode.
int vp9_refining_search_8p_c(const MACROBLOCK *x,
MV *ref_mv, int error_per_bit,
- int search_range, vp9_variance_fn_ptr_t *fn_ptr,
- int *mvjcost, int *mvcost[2], const MV *center_mv,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv,
const uint8_t *second_pred, int w, int h) {
const MACROBLOCKD *const xd = &x->e_mbd;
const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 28b46b503..e1d6abeb6 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -45,7 +45,7 @@ int vp9_init_search_range(struct VP9_COMP *cpi, int size);
int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
- vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, int_mv *dst_mv);
int vp9_hex_search(const MACROBLOCK *x,
@@ -107,15 +107,16 @@ typedef int (fractional_mv_step_comp_fp) (
extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
- MV *ref_mv, int sad_per_bit,
- int distance, vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, int sad_per_bit,
+ int distance,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv, int n);
typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x,
MV *ref_mv, int sad_per_bit,
int distance,
- vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv);
@@ -123,13 +124,14 @@ typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x,
MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit,
int *num00,
- vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv);
int vp9_refining_search_8p_c(const MACROBLOCK *x,
MV *ref_mv, int error_per_bit,
- int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
const MV *center_mv, const uint8_t *second_pred,
int w, int h);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 9f4d47bdd..cef7e0403 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -96,7 +96,7 @@ FILE *keyfile;
void vp9_init_quantizer(VP9_COMP *cpi);
static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
- {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+ {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
@@ -267,7 +267,6 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) {
// Clear down the complexity map used for rd
vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
- // Enable segmentation
vp9_enable_segmentation((VP9_PTR)cpi);
vp9_clearall_segfeatures(seg);
@@ -278,7 +277,7 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) {
vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
// Use some of the segments for in frame Q adjustment
- for (segment = 1; segment < 3; segment++) {
+ for (segment = 1; segment < 2; segment++) {
qindex_delta =
vp9_compute_qdelta_by_rate(cpi, cm->base_qindex,
in_frame_q_adj_ratio[segment]);
@@ -911,7 +910,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_uv_intra_rd_estimate = 0;
sf->use_fast_lpf_pick = 0;
sf->use_fast_coef_updates = 0;
- sf->using_small_partition_info = 0;
sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
sf->use_pick_mode = 0;
sf->encode_breakout_thresh = 0;
@@ -3708,8 +3706,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
xd->interp_kernel = vp9_get_interp_kernel(
DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ)
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_vaq_init();
+ }
if (cpi->use_svc) {
SvcEncode(cpi, size, dest, frame_flags);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 88a041984..1ab1814c0 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -346,11 +346,6 @@ typedef struct {
// inter modes or to enable it always.
int disable_split_mask;
- // TODO(jbb): Remove this and everything that uses it. It's only valid if
- // we were doing small to large partition checks. We currently do the
- // reverse.
- int using_small_partition_info;
-
// TODO(jingning): combine the related motion search speed features
// This allows us to use motion search at other sizes as a starting
// point for this motion search and limits the search range around it.
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3b4a7f6e4..912968594 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -98,10 +98,8 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
mvp_full.col >>= 3;
mvp_full.row >>= 3;
- bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
- sadpb, further_steps, 1,
- &cpi->fn_ptr[bsize],
- &ref_mv.as_mv, tmp_mv);
+ vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1,
+ &cpi->fn_ptr[bsize], &ref_mv.as_mv, tmp_mv);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 9afa0647d..2427dbe74 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -846,7 +846,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
int q;
if (frame_is_intra_only(cm)) {
- active_best_quality = rc->best_quality;
#if !CONFIG_MULTIPLE_ARF
    // Handle the special case for key frames forced when we have reached
// the maximum key frame interval. Here force the Q to a range
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cae7884fd..03def2095 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -68,7 +68,7 @@ struct rdcost_block_args {
int64_t this_rd;
int64_t best_rd;
int skip;
- const int16_t *scan, *nb;
+ const scan_order *so;
};
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
@@ -635,7 +635,7 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
args->t_left + y_idx, tx_size,
- args->scan, args->nb);
+ args->so->scan, args->so->neighbors);
}
static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -710,49 +710,40 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size,
}
}
-static void init_rdcost_stack(MACROBLOCK *x, const int64_t ref_rdcost,
- struct rdcost_block_args *arg) {
- vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
- arg->x = x;
- arg->best_rd = ref_rdcost;
-}
-
static void txfm_rd_in_plane(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size) {
- struct rdcost_block_args rd_stack;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[plane];
const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
- const scan_order *so;
+ struct rdcost_block_args args = { 0 };
+ args.x = x;
+ args.best_rd = ref_best_rd;
- init_rdcost_stack(x, ref_best_rd, &rd_stack);
if (plane == 0)
xd->mi_8x8[0]->mbmi.tx_size = tx_size;
- vp9_get_entropy_contexts(tx_size, rd_stack.t_above, rd_stack.t_left,
+ vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
pd->above_context, pd->left_context,
num_4x4_w, num_4x4_h);
- so = get_scan(xd, tx_size, pd->plane_type, 0);
- rd_stack.scan = so->scan;
- rd_stack.nb = so->neighbors;
+ args.so = get_scan(xd, tx_size, pd->plane_type, 0);
vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
- block_rd_txfm, &rd_stack);
- if (rd_stack.skip) {
+ block_rd_txfm, &args);
+ if (args.skip) {
*rate = INT_MAX;
*distortion = INT64_MAX;
*sse = INT64_MAX;
*skippable = 0;
} else {
- *distortion = rd_stack.this_dist;
- *rate = rd_stack.this_rate;
- *sse = rd_stack.this_sse;
+ *distortion = args.this_dist;
+ *rate = args.this_rate;
+ *sse = args.this_sse;
*skippable = vp9_is_skippable_in_plane(x, bsize, plane);
}
}
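
Note (not part of the diff): the hunk above replaces the memset-based init_rdcost_stack helper with C aggregate initialization, where "= { 0 }" zero-fills every member and only the non-zero fields are then assigned. A minimal standalone sketch of that idiom, using an illustrative struct rather than the full rdcost_block_args:

#include <stdint.h>
#include <string.h>

struct args {
  void *ctx;        /* owning context; stays NULL in this sketch */
  int64_t best_rd;
  int skip;
};

/* Old pattern: a helper that memsets the struct and then fills fields. */
static void init_args(struct args *a, int64_t ref_rd) {
  memset(a, 0, sizeof(*a));
  a->best_rd = ref_rd;
}

int main(void) {
  /* New pattern from the diff: zero-initialize at the declaration and
   * assign the few fields that need non-zero values. */
  struct args a = { 0 };
  a.best_rd = INT64_MAX;

  struct args b;
  init_args(&b, INT64_MAX);   /* old helper, same end state */
  return (a.best_rd == b.best_rd && a.skip == b.skip) ? 0 : 1;
}
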
@@ -787,7 +778,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
- int64_t rd[TX_SIZES][2];
+ int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
+ {INT64_MAX, INT64_MAX},
+ {INT64_MAX, INT64_MAX},
+ {INT64_MAX, INT64_MAX}};
int n, m;
int s0, s1;
const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
@@ -862,7 +856,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
- int64_t rd[TX_SIZES][2];
+ int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
+ {INT64_MAX, INT64_MAX},
+ {INT64_MAX, INT64_MAX},
+ {INT64_MAX, INT64_MAX}};
int n, m;
int s0, s1;
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
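
Note (not part of the diff): pre-filling rd[TX_SIZES][2] with INT64_MAX means transform sizes the loop never evaluates cannot win a minimum-cost comparison. A small sketch of that effect, under the assumption that the table is later scanned for its cheapest entry:

#include <stdint.h>

#define TX_SIZES 4

/* With every entry pre-set to INT64_MAX, an argmin over the table can
 * never pick a transform size whose cost was never actually computed. */
static int lowest_rd_tx_size(int64_t rd[TX_SIZES][2], int column) {
  int n, best = 0;
  for (n = 1; n < TX_SIZES; ++n)
    if (rd[n][column] < rd[best][column])
      best = n;
  return best;
}

int main(void) {
  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX}};
  rd[1][0] = 1000;   /* only one slot ever receives a real cost */
  return lowest_rd_tx_size(rd, 0) == 1 ? 0 : 1;
}
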
@@ -1604,13 +1601,11 @@ typedef struct {
int mvthresh;
} BEST_SEG_INFO;
-static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
- int r = 0;
- r |= (mv->as_mv.row >> 3) < x->mv_row_min;
- r |= (mv->as_mv.row >> 3) > x->mv_row_max;
- r |= (mv->as_mv.col >> 3) < x->mv_col_min;
- r |= (mv->as_mv.col >> 3) > x->mv_col_max;
- return r;
+static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
+ return (mv->row >> 3) < x->mv_row_min ||
+ (mv->row >> 3) > x->mv_row_max ||
+ (mv->col >> 3) < x->mv_col_min ||
+ (mv->col >> 3) > x->mv_col_max;
}
static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
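
Note (not part of the diff): the rewritten mv_check_bounds takes the MV directly instead of an int_mv union and short-circuits; the >> 3 converts the 1/8-pel vector to full-pel before comparing against the search bounds. A self-contained sketch of the same check, with a hypothetical bounds struct standing in for the MACROBLOCK fields:

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t row, col; } MV;   /* 1/8-pel units */

/* Hypothetical stand-in for the MACROBLOCK search-bound fields. */
typedef struct {
  int mv_row_min, mv_row_max;
  int mv_col_min, mv_col_max;
} Bounds;

static int mv_check_bounds(const Bounds *b, const MV *mv) {
  return (mv->row >> 3) < b->mv_row_min ||
         (mv->row >> 3) > b->mv_row_max ||
         (mv->col >> 3) < b->mv_col_min ||
         (mv->col >> 3) > b->mv_col_max;
}

int main(void) {
  const Bounds b = { -16, 16, -16, 16 };   /* full-pel limits */
  const MV inside = { 8 * 10, -8 * 3 };    /* (10, -3) full-pel: in range */
  const MV outside = { 8 * 20, 0 };        /* row 20 exceeds mv_row_max */
  printf("%d %d\n", mv_check_bounds(&b, &inside),
         mv_check_bounds(&b, &outside));   /* prints: 0 1 */
  return 0;
}
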
@@ -1924,10 +1919,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
}
// Trap vectors that reach beyond the UMV borders
- if (mv_check_bounds(x, &mode_mv[this_mode]))
- continue;
- if (has_second_rf &&
- mv_check_bounds(x, &second_mode_mv[this_mode]))
+ if (mv_check_bounds(x, &mode_mv[this_mode].as_mv) ||
+ (has_second_rf &&
+ mv_check_bounds(x, &second_mode_mv[this_mode].as_mv)))
continue;
if (filter_idx > 0) {
@@ -2380,24 +2374,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
vp9_set_mv_search_range(x, &ref_mv.as_mv);
- // Adjust search parameters based on small partitions' result.
- if (x->fast_ms) {
- // adjust search range
- step_param = 6;
- if (x->fast_ms > 1)
- step_param = 8;
+ // Work out the size of the first step in the mv step search.
+ // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+ if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+ // Take wtd average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
+ cpi->mv_step_param) >> 1;
} else {
- // Work out the size of the first step in the mv step search.
- // 0 here is maximum length first step. 1 is MAX >> 1 etc.
- if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
- // Take wtd average of the step_params based on the last frame's
- // max mv magnitude and that based on the best ref mvs of the current
- // block for the given reference.
- step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
- cpi->mv_step_param) >> 1;
- } else {
- step_param = cpi->mv_step_param;
- }
+ step_param = cpi->mv_step_param;
}
if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
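
Note (not part of the diff): the hunk above drops the fast_ms shortcut and keeps only the blend of two step-size estimates, one based on the previous frame's maximum MV magnitude and one based on the baseline mv_step_param. A rough standalone sketch of that blend; the history-based estimate is passed in directly here rather than being derived via vp9_init_search_range:

#include <stdio.h>

/* 0 is the maximum-length first step, 1 is MAX >> 1, and so on. */
static int pick_step_param(int auto_mv_step_size, int show_frame,
                           int history_based_step, int baseline_step) {
  if (auto_mv_step_size && show_frame) {
    /* Plain average, matching the (a + b) >> 1 in the diff. */
    return (history_based_step + baseline_step) >> 1;
  }
  return baseline_step;
}

int main(void) {
  /* e.g. history suggests 6, the baseline is 2: the blend lands on 4. */
  printf("%d\n", pick_step_param(1, 1, 6, 2));
  return 0;
}
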
@@ -2750,7 +2736,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (this_mode != NEWMV)
clamp_mv2(&cur_mv[i].as_mv, xd);
- if (mv_check_bounds(x, &cur_mv[i]))
+ if (mv_check_bounds(x, &cur_mv[i].as_mv))
return INT64_MAX;
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
@@ -3260,12 +3246,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
continue;
- // Skip some checking based on small partitions' result.
- if (x->fast_ms > 1 && !ref_frame)
- continue;
- if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
- continue;
-
mbmi->ref_frame[0] = ref_frame;
mbmi->ref_frame[1] = second_ref_frame;
@@ -4126,11 +4106,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (tmp_rd == INT64_MAX)
continue;
} else {
- if (cm->interp_filter == SWITCHABLE) {
- int rs = get_switchable_rate(x);
- tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
- }
- tmp_rd = tmp_best_rdu;
total_sse = tmp_best_sse;
rate = tmp_best_rate;
rate_y = tmp_best_ratey;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ca11dda1e..004047773 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -29,7 +29,6 @@
#include "vpx_scale/vpx_scale.h"
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
uint8_t *y_mb_ptr,
@@ -160,11 +159,9 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
/*cpi->sf.search_method == HEX*/
// Ignore mv costing by sending NULL pointer instead of cost arrays
- bestsme = vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
- &cpi->fn_ptr[BLOCK_16X16],
- 0, &best_ref_mv1, ref_mv);
+ vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+ &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
-#if ALT_REF_SUBPEL_ENABLED
// Try sub-pixel MC?
// if (bestsme > error_thresh && bestsme < INT_MAX)
{
@@ -180,7 +177,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
NULL, NULL,
&distortion, &sse);
}
-#endif
// Restore input state
x->plane[0].src = src;
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 1f9cb8709..acd7c416e 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -19,8 +19,8 @@
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/common/vp9_systemdependent.h"
-#define ENERGY_MIN (-3)
-#define ENERGY_MAX (3)
+#define ENERGY_MIN (-1)
+#define ENERGY_MAX (1)
#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
#define ENERGY_IN_BOUNDS(energy)\
assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
@@ -65,7 +65,7 @@ void vp9_vaq_init() {
vp9_clear_system_state(); // __asm emms;
- base_ratio = 1.8;
+ base_ratio = 1.5;
for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
Q_RATIO(i) = pow(base_ratio, i/3.0);
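
Note (not part of the diff): with the narrower energy range and the lower base_ratio, the loop above yields pow(1.5, i/3.0) for i in {-1, 0, 1}, roughly 0.87, 1.00 and 1.14. A short check of those numbers:

#include <math.h>
#include <stdio.h>

#define ENERGY_MIN (-1)
#define ENERGY_MAX (1)

int main(void) {
  const double base_ratio = 1.5;
  int i;
  for (i = ENERGY_MIN; i <= ENERGY_MAX; i++)
    printf("Q_RATIO(%d) = %.3f\n", i, pow(base_ratio, i / 3.0));
  /* Prints approximately 0.874, 1.000, 1.145, a much gentler spread than
   * base_ratio 1.8 over energies -3..3 gave before this change. */
  return 0;
}
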
@@ -80,30 +80,34 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) {
cm->y_dc_delta_q);
int i;
- vp9_enable_segmentation((VP9_PTR)cpi);
- vp9_clearall_segfeatures(seg);
+ if (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ vp9_enable_segmentation((VP9_PTR)cpi);
+ vp9_clearall_segfeatures(seg);
- seg->abs_delta = SEGMENT_DELTADATA;
+ seg->abs_delta = SEGMENT_DELTADATA;
- vp9_clear_system_state(); // __asm emms;
+ vp9_clear_system_state(); // __asm emms;
- for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
- int qindex_delta, segment_rdmult;
+ for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+ int qindex_delta, segment_rdmult;
- if (Q_RATIO(i) == 1) {
- // No need to enable SEG_LVL_ALT_Q for this segment
- RDMULT_RATIO(i) = 1;
- continue;
- }
+ if (Q_RATIO(i) == 1) {
+ // No need to enable SEG_LVL_ALT_Q for this segment
+ RDMULT_RATIO(i) = 1;
+ continue;
+ }
- qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
- vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
- vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
+ qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
+ vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
- segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
- cm->y_dc_delta_q);
+ segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
+ cm->y_dc_delta_q);
- RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+ RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+ }
}
}
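
Note (not part of the diff): the restructured vp9_vaq_frame_setup now rebuilds the segmentation data only on frames that will serve as prediction references: key frames, alt-ref refreshes, and golden refreshes that are not alt-ref pass-throughs. A condensed sketch of just that gating condition, using a hypothetical flags struct in place of the encoder state:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical flags mirroring the fields tested in the diff. */
typedef struct {
  bool is_key_frame;            /* cm->frame_type == KEY_FRAME */
  bool refresh_alt_ref_frame;   /* cpi->refresh_alt_ref_frame */
  bool refresh_golden_frame;    /* cpi->refresh_golden_frame */
  bool is_src_frame_alt_ref;    /* cpi->rc.is_src_frame_alt_ref */
} FrameState;

static bool should_setup_vaq_segmentation(const FrameState *f) {
  return f->is_key_frame ||
         f->refresh_alt_ref_frame ||
         (f->refresh_golden_frame && !f->is_src_frame_alt_ref);
}

int main(void) {
  const FrameState golden_via_altref = { false, false, true, true };
  /* Golden refreshes that merely copy the alt-ref are skipped: prints 0. */
  printf("%d\n", should_setup_vaq_segmentation(&golden_via_altref));
  return 0;
}
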
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 85e83b834..48d6a7ca0 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -79,6 +79,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm