diff options
-rw-r--r-- | vp8/common/recon.h | 8 | ||||
-rw-r--r-- | vp8/common/reconinter.h | 9 | ||||
-rw-r--r-- | vp8/common/reconintra.c | 2 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.sh | 8 | ||||
-rw-r--r-- | vp8/common/x86/filter_sse2.c | 289 | ||||
-rw-r--r-- | vp8/common/x86/filter_sse4.c | 3 | ||||
-rw-r--r-- | vp8/decoder/dequantize.h | 11 | ||||
-rw-r--r-- | vp8/decoder/detokenize.c | 3 | ||||
-rw-r--r-- | vp8/encoder/encodeframe.c | 25 | ||||
-rw-r--r-- | vp8/encoder/encodemb.c | 16 | ||||
-rw-r--r-- | vp8/encoder/encodemb.h | 10 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 1 | ||||
-rw-r--r-- | vp8/encoder/rdopt.c | 11 | ||||
-rw-r--r-- | vp8/vp8_common.mk | 5 |
14 files changed, 368 insertions, 33 deletions
diff --git a/vp8/common/recon.h b/vp8/common/recon.h index 3527fc14d..0bb5c8863 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -262,4 +262,12 @@ typedef struct vp8_recon_rtcd_vtable { void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd); + +#if CONFIG_SUPERBLOCKS +extern void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd, + MACROBLOCKD *xd, uint8_t *dst); +extern void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd, + MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst); +#endif + #endif diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 7ad0adbd4..37e34b5e1 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -45,6 +45,15 @@ extern void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, int dst_ystride, int dst_uvstride); +#if CONFIG_SUPERBLOCKS +extern void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); +#endif + extern void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index e391fa9be..e84afa139 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -218,7 +218,7 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, int r, c, i; for (i = 0; i < bsize; i++) { - yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1]; + yleft_col[i] = src[i * src_stride - 1]; } /* for Y */ diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index 1cb5de311..66029f88e 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -14,8 +14,8 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u # compiles warning free but a dissassembly of generated code show bugs. To be # on the safe side, only enabled when compiled with 'gcc'. 
if [ "$CONFIG_GCC" = "yes" ]; then - specialize vp8_filter_block2d_4x4_8 sse4_1 - specialize vp8_filter_block2d_8x4_8 sse4_1 - specialize vp8_filter_block2d_8x8_8 sse4_1 - specialize vp8_filter_block2d_16x16_8 sse4_1 + specialize vp8_filter_block2d_4x4_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x4_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x8_8 sse4_1 sse2 + specialize vp8_filter_block2d_16x16_8 sse4_1 sse2 fi diff --git a/vp8/common/x86/filter_sse2.c b/vp8/common/x86/filter_sse2.c new file mode 100644 index 000000000..fe57b4e0b --- /dev/null +++ b/vp8/common/x86/filter_sse2.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> // for alignment checks +#include <emmintrin.h> // SSE2 +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" // for DECLARE_ALIGNED +#include "vpx_rtcd.h" + +// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is +// just a quick partial snapshot so that others can already use some +// speedup. +// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap +// filtering. +// TODO(cd): Add some comments, better variable naming. +// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum +// of positive above 128), or have higher precision filter +// coefficients. 
+ +DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { + VP8_FILTER_WEIGHT >> 1, + VP8_FILTER_WEIGHT >> 1, + VP8_FILTER_WEIGHT >> 1, + VP8_FILTER_WEIGHT >> 1, +}; + +// Creating a macro to do more than four pixels at once to hide instruction +// latency is actually slower :-( +#define DO_FOUR_PIXELS(result, src_ptr, offset) \ + { \ + /* Do shifted load to achieve the required shuffles through unpacking */ \ + const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \ + const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \ + const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \ + const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \ + const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \ + const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \ + const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \ + const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \ + /* Shift by 4 bytes through shuffle to get additional shifted loads */ \ + const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \ + const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \ + const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \ + const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \ + const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \ + const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \ + const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \ + const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \ + /* multiply accumulate them */ \ + const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ + const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ + const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ + const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ + const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ + const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ + __m128i mad_all 
= _mm_add_epi32(mad0123, mad4567); \ + mad_all = _mm_add_epi32(mad_all, rounding); \ + result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); \ + } + +void vp8_filter_block2d_4x4_8_sse2 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + __m128i intermediateA, intermediateB, intermediateC; + + const int kInterp_Extend = 4; + + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); + + // check alignment + assert(0 == ((long)HFilter_aligned16)%16); + assert(0 == ((long)VFilter_aligned16)%16); + + { + __m128i transpose3_0; + __m128i transpose3_1; + __m128i transpose3_2; + __m128i transpose3_3; + + // Horizontal pass (src -> intermediate). + { + const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + + { + __m128i mad_all0; + __m128i mad_all1; + __m128i mad_all2; + __m128i mad_all3; + DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) + DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); + intermediateA = _mm_packus_epi16(mad_all0, mad_all2); + // -- + src_ptr += src_stride*4; + // -- + DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) + DO_FOUR_PIXELS(mad_all3, src_ptr, 
3*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); + intermediateB = _mm_packus_epi16(mad_all0, mad_all2); + // -- + src_ptr += src_stride*4; + // -- + DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); + intermediateC = _mm_packus_epi16(mad_all0, mad_all2); + } + } + + // Transpose result (intermediate -> transpose3_x) + { + // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 + // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 + // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx + const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); + const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); + const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); + const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); + // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 + // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx + // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx + const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); + const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); + // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 + // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 + // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx + // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx + const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); + const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); + const __m128i transpose2_2 = 
_mm_unpacklo_epi8(transpose1_2, transpose1_3); + const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx + // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx + transpose3_0 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose2_2), + _MM_SHUFFLE(1, 0, 1, 0))); + transpose3_1 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose2_2), + _MM_SHUFFLE(3, 2, 3, 2))); + transpose3_2 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose2_3), + _MM_SHUFFLE(1, 0, 1, 0))); + transpose3_3 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose2_3), + _MM_SHUFFLE(3, 2, 3, 2))); + // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx + // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx + // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx + // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx + } + + // Vertical pass (transpose3_x -> dst). 
+ { + const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i col0, col1, col2, col3; + DECLARE_ALIGNED(16, unsigned char, temp[32]); + { + _mm_store_si128((__m128i *)temp, transpose3_0); + DO_FOUR_PIXELS(col0, temp, 0); + } + { + _mm_store_si128((__m128i *)temp, transpose3_1); + DO_FOUR_PIXELS(col1, temp, 0); + } + { + _mm_store_si128((__m128i *)temp, transpose3_2); + DO_FOUR_PIXELS(col2, temp, 0); + } + { + _mm_store_si128((__m128i *)temp, transpose3_3); + DO_FOUR_PIXELS(col3, temp, 0); + } + // transpose + { + __m128i T0 = _mm_unpacklo_epi32(col0, col1); + __m128i T1 = _mm_unpacklo_epi32(col2, col3); + __m128i T2 = _mm_unpackhi_epi32(col0, col1); + __m128i T3 = _mm_unpackhi_epi32(col2, col3); + col0 = _mm_unpacklo_epi64(T0, T1); + col1 = _mm_unpackhi_epi64(T0, T1); + col2 = _mm_unpacklo_epi64(T2, T3); + col3 = _mm_unpackhi_epi64(T2, T3); + } + // saturate to 8 bit + { + col0 = _mm_packs_epi32(col0, col0); + col0 = _mm_packus_epi16(col0, col0); + col1 = _mm_packs_epi32(col1, col1); + col1 = _mm_packus_epi16(col1, col1); + col2 = _mm_packs_epi32 (col2, col2); + col2 = _mm_packus_epi16(col2, col2); + col3 = _mm_packs_epi32 (col3, col3); + col3 = _mm_packus_epi16(col3, col3); + } + // store + { + *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); + *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); + *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); + *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); + } + } + } +} + +void vp8_filter_block2d_8x4_8_sse2 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short 
*HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int j; + for (j=0; j<8; j+=4) { + vp8_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j, dst_stride); + } +} + +void vp8_filter_block2d_8x8_8_sse2 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<8; i+=4) { + for (j=0; j<8; j+=4) { + vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} + +void vp8_filter_block2d_16x16_8_sse2 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<16; i+=4) { + for (j=0; j<16; j+=4) { + vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} diff --git a/vp8/common/x86/filter_sse4.c b/vp8/common/x86/filter_sse4.c index a037622e1..c461db173 100644 --- a/vp8/common/x86/filter_sse4.c +++ b/vp8/common/x86/filter_sse4.c @@ -25,9 +25,6 @@ // TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum // of positive above 128), or have higher precision filter // coefficients. 
-// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not -// require SSE4.1 -// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3 DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { 0x00, 0x01, diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h index c4c8d4a06..2326e467d 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/decoder/dequantize.h @@ -201,5 +201,16 @@ void vp8_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq, int pitch, int stride); #endif +#if CONFIG_SUPERBLOCKS +void vp8_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs, + short *dc, MACROBLOCKD *xd); +void vp8_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs, + MACROBLOCKD *xd); +#endif #endif diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index adff88a59..a6c837084 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -295,6 +295,7 @@ static int vp8_decode_coefs(VP8D_COMP *dx, const MACROBLOCKD *xd, const vp8_prob *prob, *coef_probs; switch (block_type) { + default: case TX_4X4: coef_probs = fc->coef_probs[type][0][0]; break; @@ -302,7 +303,7 @@ static int vp8_decode_coefs(VP8D_COMP *dx, const MACROBLOCKD *xd, coef_probs = fc->coef_probs_8x8[type][0][0]; break; #if CONFIG_TX16X16 - default: + case TX_16X16: coef_probs = fc->coef_probs_16x16[type][0][0]; break; #endif diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 4472497e0..f834e0b83 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -21,6 +21,7 @@ #include "vp8/common/setupintrarecon.h" #include "encodeintra.h" #include "vp8/common/reconinter.h" +#include "vp8/common/invtrans.h" #include "rdopt.h" #include "vp8/common/findnearmv.h" #include "vp8/common/reconintra.h" @@ -76,7 +77,8 @@ void vp8cx_encode_intra_super_block(VP8_COMP *cpi, MACROBLOCK *x, 
TOKENEXTRA **t, int mb_col); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); - +extern void vp8_stuff_mb_8x8(VP8_COMP *cpi, + MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run); #ifdef MODE_STATS unsigned int inter_y_modes[MB_MODE_COUNT]; @@ -852,7 +854,6 @@ static void encode_sb(VP8_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, TOKENEXTRA **tp) { - VP8_COMMON *pc = cm; int i; int map_index; int mb_row, mb_col; @@ -1693,7 +1694,6 @@ static void update_sb_skip_coeff_state(VP8_COMP *cpi, // reset pointer, stuff EOBs where necessary *tp = t[0]; for (n = 0; n < 4; n++) { - TOKENEXTRA *tbak = *tp; if (skip[n]) { x->e_mbd.above_context = &ta[n]; x->e_mbd.left_context = &tl[n]; @@ -1715,9 +1715,12 @@ void vp8cx_encode_intra_super_block(VP8_COMP *cpi, int n; MACROBLOCKD *xd = &x->e_mbd; VP8_COMMON *cm = &cpi->common; - const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; - const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; - const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; + const uint8_t *src = x->src.y_buffer; + uint8_t *dst = xd->dst.y_buffer; + const uint8_t *usrc = x->src.u_buffer; + uint8_t *udst = xd->dst.u_buffer; + const uint8_t *vsrc = x->src.v_buffer; + uint8_t *vdst = xd->dst.v_buffer; int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd); @@ -2041,13 +2044,15 @@ void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, const int output_enabled = 1; VP8_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; - const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; - const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; + const uint8_t *src = x->src.y_buffer; + uint8_t *dst = xd->dst.y_buffer; + const uint8_t *usrc = x->src.u_buffer; + uint8_t *udst = xd->dst.u_buffer; + const 
uint8_t *vsrc = x->src.v_buffer; + uint8_t *vdst = xd->dst.v_buffer; int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd); - int mis = xd->mode_info_stride; unsigned int segment_id = xd->mode_info_context->mbmi.segment_id; int seg_ref_active; unsigned char ref_pred_flag; diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 8c48b0d83..a26350552 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -67,8 +67,10 @@ void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) { } } -void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, - unsigned char *upred, unsigned char *vpred, int dst_stride) { +void vp8_subtract_mbuv_s_c(short *diff, const unsigned char *usrc, + const unsigned char *vsrc, int src_stride, + const unsigned char *upred, + const unsigned char *vpred, int dst_stride) { short *udiff = diff + 256; short *vdiff = diff + 320; @@ -95,14 +97,16 @@ void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc } } -void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) { +void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, + unsigned char *vsrc, unsigned char *pred, int stride) { unsigned char *upred = pred + 256; unsigned char *vpred = pred + 320; vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8); } -void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) { +void vp8_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride, + const unsigned char *pred, int dst_stride) { int r, c; for (r = 0; r < 16; r++) { @@ -116,8 +120,8 @@ void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsig } } -void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, 
int stride) -{ +void vp8_subtract_mby_c(short *diff, unsigned char *src, + unsigned char *pred, int stride) { vp8_subtract_mby_s_c(diff, src, stride, pred, 16); } diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h index 13ddcf115..653774aaf 100644 --- a/vp8/encoder/encodemb.h +++ b/vp8/encoder/encodemb.h @@ -132,4 +132,14 @@ void vp8_optimize_mby_16x16(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd); void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch); +#if CONFIG_SUPERBLOCKS +void vp8_subtract_mbuv_s_c(short *diff, const unsigned char *usrc, + const unsigned char *vsrc, int src_stride, + const unsigned char *upred, + const unsigned char *vpred, int dst_stride); +void vp8_subtract_mby_s_c(short *diff, const unsigned char *src, + int src_stride, const unsigned char *pred, + int dst_stride); +#endif + #endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 256c70386..c3df54481 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -630,7 +630,6 @@ static void update_reference_segmentation_map(VP8_COMP *cpi) { for (row = 0; row < sb_rows; row++) { for (col = 0; col < sb_cols; col++) { MODE_INFO *miptr = mi + col * 2; - uint8_t *seg = segmap + col * 2; uint8_t *cache = segcache + col * 2; #if CONFIG_SUPERBLOCKS if (miptr->mbmi.encoded_as_sb) { diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index d9b49bfb6..d217f2ffc 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -2943,7 +2943,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0; int uv_intra_skippable_8x8 = 0; int rate_y, UNINITIALIZED_IS_SAFE(rate_uv); - int distortion_uv; + int distortion_uv = INT_MAX; int64_t best_yrd = INT64_MAX; #if CONFIG_PRED_FILTER int best_filter_state; @@ -3856,7 +3856,6 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int mode16x16; int mode8x8[2][4]; int dist; - int rateuv8, 
rateuv_tokenonly8, distuv8; mbmi->ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv); @@ -3961,7 +3960,6 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, BLOCKD *d = &xd->block[0]; MB_PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame; - int mis = xd->mode_info_stride; unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; int comp_pred; int_mv best_ref_mv, second_best_ref_mv; @@ -4313,11 +4311,11 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, if ((sse - var < q2dc *q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok - int sse2, sse3; - int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) + unsigned int sse2, sse3; + var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) (x->src.u_buffer, x->src.uv_stride, xd->dst.u_buffer, xd->dst.uv_stride, &sse2); - int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) + var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) (x->src.v_buffer, x->src.uv_stride, xd->dst.v_buffer, xd->dst.uv_stride, &sse3); sse2 += sse3; @@ -4658,7 +4656,6 @@ void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *totalrate, int *totaldist) { - VP8_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; int rate, distortion; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 9f708ac4c..4d3d0345e 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -116,6 +116,11 @@ ifeq ($(HAVE_SSE4_1),yes) vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4 endif +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c +ifeq ($(HAVE_SSE2),yes) +vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2 +endif + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h |