23 files changed, 340 insertions, 30 deletions
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index af2a5df98..db079d5ed 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -121,8 +121,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
         cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
 
-        /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
+        /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+        cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
         cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
+        cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
     }
 #endif
diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
index a9060d76f..000805d4f 100644
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -27,8 +27,11 @@
 |vp8_mse16x16_armv6| PROC
 
     push    {r4-r9, lr}
-    mov     r12, #16            ; set loop counter to 16 (=block height)
 
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
     mov     r4, #0              ; initialize sse = 0
 
 loop
@@ -39,8 +42,10 @@ loop
     mov     lr, #0              ; constant zero
 
     usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r7, r8, lr          ; select bytes with positive difference
     usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r8, r9, lr          ; select bytes with negative difference
 
     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
index c759f7c65..1b4f5cf3b 100644
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -24,6 +24,12 @@
 ; stack       max_sad (not used)
 |vp8_sad16x16_armv6| PROC
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
     mov     r4, #0              ; sad = 0;
     mov     r5, #8              ; loop count
 
@@ -45,6 +51,9 @@ loop
     add     r0, r0, r1          ; set src pointer to next row
     add     r2, r2, r3          ; set dst pointer to next row
 
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
     usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
     usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
 
@@ -70,6 +79,9 @@ loop
     usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
     usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
 
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
     subs    r5, r5, #1          ; decrement loop counter
     add     r4, r4, r8          ; add partial sad values
 
diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
index 988376390..5feaa8bc2 100644
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance16x16_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     mov     r11, #0             ; initialize sse = 0
     mov     r12, #16            ; set loop counter to 16 (=block height)
@@ -37,8 +41,10 @@ loop
     mov     lr, #0              ; constant zero
 
     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r9, lr          ; select bytes with negative difference
 
     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
index 7daecb925..adc353d20 100644
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -23,6 +23,10 @@
 |vp8_variance8x8_armv6| PROC
 
     push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r12, #8             ; set loop counter to 8 (=block height)
     mov     r4, #0              ; initialize sum = 0
     mov     r5, #0              ; initialize sse = 0
@@ -35,8 +39,10 @@ loop
     mov     lr, #0              ; constant zero
 
     usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r10, r8, lr         ; select bytes with positive difference
     usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r8, r9, lr          ; select bytes with negative difference
 
     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
index 2350f3e8b..1b5489795 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_h_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -42,8 +46,10 @@ loop
     eor     r4, r4, r10
 
     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference
 
     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
index f9ae3b7e2..38c55edf8 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_hv_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -53,8 +57,10 @@ loop
     eor     r4, r4, r10
 
     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference
 
     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index 9e0a03548..22a50eb00 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_v_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -43,8 +47,10 @@ loop
     eor     r4, r4, r10
 
     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference
 
     ; calculate partial sums
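The pld lines added throughout these ARMv6 functions are software-prefetch hints: with the source pointer in r0 and its stride in r1, pld [r0, r1, lsl #0] warms the cache line one row ahead, and pld [r0, r1, lsl #1] two rows ahead, so each loop iteration prefetches the row that will be read two iterations later. A rough C analogue, assuming GCC/Clang's __builtin_prefetch (the function and its 16-row shape are illustrative, not part of the patch):

    /* Walk a 16-row block while hinting the prefetcher, mirroring the
     * pld [rX, rY, lsl #N] pattern used in the assembly above. */
    static void process_block(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride)
    {
        int row;

        __builtin_prefetch(src + src_stride);         /* pld [r0, r1, lsl #0] */
        __builtin_prefetch(ref + ref_stride);         /* pld [r2, r3, lsl #0] */

        for (row = 0; row < 16; row++)
        {
            __builtin_prefetch(src + 2 * src_stride); /* pld [r0, r1, lsl #1] */
            __builtin_prefetch(ref + 2 * ref_stride); /* pld [r2, r3, lsl #1] */

            /* ... SAD/SSE arithmetic on this row ... */

            src += src_stride;
            ref += ref_stride;
        }
    }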
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
index 3dd92b12e..dcf3c5090 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -10,6 +10,7 @@
 
     EXPORT  |vp8_fast_quantize_b_neon|
+    EXPORT  |vp8_fast_quantize_b_pair_neon|
 
     INCLUDE asm_enc_offsets.asm
 
@@ -19,6 +20,138 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=4
 
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+    stmfd   sp!, {r4-r9}
+    vstmdb  sp!, {q4-q7}
+
+    ldr     r4, [r0, #vp8_block_coeff]
+    ldr     r5, [r0, #vp8_block_quant_fast]
+    ldr     r6, [r0, #vp8_block_round]
+
+    vld1.16 {q0, q1}, [r4@128]  ; load z
+
+    ldr     r7, [r2, #vp8_blockd_qcoeff]
+
+    vabs.s16 q4, q0             ; calculate x = abs(z)
+    vabs.s16 q5, q1
+
+    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+    vshr.s16 q2, q0, #15        ; sz
+    vshr.s16 q3, q1, #15
+
+    vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
+    vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
+
+    ldr     r4, [r1, #vp8_block_coeff]
+
+    vadd.s16 q4, q6             ; x + Round
+    vadd.s16 q5, q7
+
+    vld1.16 {q0, q1}, [r4@128]  ; load z2
+
+    vqdmulh.s16 q4, q8          ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16 q5, q9
+
+    vabs.s16 q10, q0            ; calculate x2 = abs(z_2)
+    vabs.s16 q11, q1
+    vshr.s16 q12, q0, #15       ; sz2
+    vshr.s16 q13, q1, #15
+
+    ;modify data to have its original sign
+    veor.s16 q4, q2             ; y^sz
+    veor.s16 q5, q3
+
+    vadd.s16 q10, q6            ; x2 + Round
+    vadd.s16 q11, q7
+
+    ldr     r8, [r2, #vp8_blockd_dequant]
+
+    vqdmulh.s16 q10, q8         ; y2 = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16 q11, q9
+
+    vshr.s16 q4, #1             ; right shift 1 after vqdmulh
+    vshr.s16 q5, #1
+
+    vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
+
+    vsub.s16 q4, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16 q5, q3
+
+    vshr.s16 q10, #1            ; right shift 1 after vqdmulh
+    vshr.s16 q11, #1
+
+    ldr     r9, [r2, #vp8_blockd_dqcoeff]
+
+    veor.s16 q10, q12           ; y2^sz2
+    veor.s16 q11, q13
+
+    vst1.s16 {q4, q5}, [r7]     ; store: qcoeff = x1
+
+
+    vsub.s16 q10, q12           ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16 q11, q13
+
+    ldr     r6, [r3, #vp8_blockd_qcoeff]
+
+    vmul.s16 q2, q6, q4         ; x * Dequant
+    vmul.s16 q3, q7, q5
+
+    ldr     r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vceq.s16 q8, q8             ; set q8 to all 1
+
+    vst1.s16 {q10, q11}, [r6]   ; store: qcoeff = x2
+
+    vmul.s16 q12, q6, q10       ; x2 * Dequant
+    vmul.s16 q13, q7, q11
+
+    vld1.16 {q6, q7}, [r0@128]  ; load inverse scan order
+
+    vtst.16 q14, q4, q8         ; now find eob
+    vtst.16 q15, q5, q8         ; non-zero element is set to all 1
+
+    vst1.s16 {q2, q3}, [r9]     ; store dqcoeff = x * Dequant
+
+    ldr     r7, [r3, #vp8_blockd_dqcoeff]
+
+    vand    q0, q6, q14         ; get all valid numbers from scan array
+    vand    q1, q7, q15
+
+    vst1.s16 {q12, q13}, [r7]   ; store dqcoeff = x * Dequant
+
+    vtst.16 q2, q10, q8         ; now find eob
+    vtst.16 q3, q11, q8         ; non-zero element is set to all 1
+
+    vmax.u16 q0, q0, q1         ; find maximum value in q0, q1
+
+    vand    q10, q6, q2         ; get all valid numbers from scan array
+    vand    q11, q7, q3
+    vmax.u16 q10, q10, q11      ; find maximum value in q10, q11
+
+    vmax.u16 d0, d0, d1
+    vmax.u16 d20, d20, d21
+    vmovl.u16 q0, d0
+    vmovl.u16 q10, d20
+
+
+    vmax.u32 d0, d0, d1
+    vmax.u32 d20, d20, d21
+    vpmax.u32 d0, d0, d0
+    vpmax.u32 d20, d20, d20
+
+    add     r4, r2, #vp8_blockd_eob
+    add     r5, r3, #vp8_blockd_eob
+
+    vst1.32 {d0[0]}, [r4@32]
+    vst1.32 {d20[0]}, [r5@32]
+
+    vldmia  sp!, {q4-q7}
+    ldmfd   sp!, {r4-r9}
+    bx      lr
+
+    ENDP
 
 ;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 |vp8_fast_quantize_b_neon| PROC
 
@@ -97,10 +230,8 @@
 
     vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
 
-    vmov.32 r0, d0[0]           ; this instruction takes 1+13 cycles
-                                ; if we have vfp, we could use
-                                ; vstr s0, [r1, #vp8_blockd_eob]
-    str     r0, [r1, #vp8_blockd_eob]
+    add     r4, r1, #vp8_blockd_eob
+    vst1.32 {d0[0]}, [r4@32]
 
     ldmfd   sp!, {r4-r7}
     bx      lr
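Per block, the new pair routine performs the same arithmetic as the existing scalar fast quantizer, just interleaved for two blocks so loads and multiplies overlap. A sketch of that arithmetic in C, simplified from vp8_fast_quantize_b_c (parameter names are illustrative; the NEON code reaches the >>16 through vqdmulh, which doubles the product, hence its extra right shift by 1):

    #include <stdlib.h>

    /* One block of VP8 fast quantization.  inv_zig_zag[i] is the 1-based
     * position of coefficient i in scan order, so the maximum over nonzero
     * coefficients is the end-of-block (eob) value stored at the end. */
    static void fast_quantize_block(const short *z, const short *round,
                                    const short *quant, const short *dequant,
                                    const short *inv_zig_zag,
                                    short *qcoeff, short *dqcoeff, int *eob)
    {
        int i;

        *eob = 0;

        for (i = 0; i < 16; i++)
        {
            int sz = z[i] >> 15;                  /* all 0 or all 1 (sign) */
            int y  = ((abs(z[i]) + round[i]) * quant[i]) >> 16;

            y = (y ^ sz) - sz;                    /* restore the sign      */
            qcoeff[i]  = (short)y;
            dqcoeff[i] = (short)(y * dequant[i]);

            if (y && inv_zig_zag[i] > *eob)
                *eob = inv_zig_zag[i];
        }
    }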
diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 000000000..52d84013e
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* vp8_quantize_mbX functions here differs from corresponding ones in
+ * quantize.c only by using quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer */
+void vp8_quantize_mby_neon(MACROBLOCK *x)
+{
+    int i;
+    int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+    for (i = 0; i < 16; i+=2)
+        x->quantize_b_pair(&x->block[i], &x->block[i+1],
+                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+    if(has_2nd_order)
+        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x)
+{
+    int i;
+    int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+    for (i = 0; i < 24; i+=2)
+        x->quantize_b_pair(&x->block[i], &x->block[i+1],
+                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+    if (has_2nd_order)
+        x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 16; i < 24; i+=2)
+        x->quantize_b_pair(&x->block[i], &x->block[i+1],
+                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+}
+
+#endif /* HAVE_ARMV7 */
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
index af4187ac1..7d2088d2d 100644
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -16,8 +16,10 @@
 
 extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
 
 #endif /* HAVE_ARMV6 */
 
@@ -25,10 +27,25 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
 #if HAVE_ARMV7
 
 extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
 
+#undef  vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
 #endif /* HAVE_ARMV7 */
 
 #endif
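The #if !CONFIG_RUNTIME_CPU_DETECT guards added to quantize_arm.h exist because libvpx dispatches these hooks in one of two ways. When runtime CPU detection is enabled, vp8_arch_arm_encoder_init() fills the rtcd vtable (as in arm_csystemdependent.c earlier in this diff) and the #define overrides must stay out of the way; otherwise the overrides bind the names at compile time. A miniature of the pattern — the runtime branch appears verbatim in quantize.h later in this diff, while the compile-time branch is reconstructed here from the library's usual *_INVOKE convention and should be treated as a sketch:

    #if CONFIG_RUNTIME_CPU_DETECT
    /* read the function pointer installed by the arch init code */
    #define QUANTIZE_INVOKE(ctx, fn) (ctx)->fn
    #else
    /* paste the name into a direct call; vp8_quantize_fastquantb_pair is
     * vp8_fast_quantize_b_pair_neon on NEON builds, the C fallback elsewhere */
    #define QUANTIZE_INVOKE(ctx, fn) vp8_quantize_##fn
    #endif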
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index bc6eeeb14..e8a5b78eb 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -123,6 +123,7 @@ typedef struct
     void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
     void (*short_walsh4x4)(short *input, short *output, int pitch);
     void (*quantize_b)(BLOCK *b, BLOCKD *d);
+    void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
 
 } MACROBLOCK;
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index abdbbac0b..616e702b4 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1266,8 +1266,10 @@ int vp8cx_encode_inter_macroblock
     /* Are we using the fast quantizer for the mode selection? */
     if(cpi->sf.use_fastquant_for_pick)
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                             fastquantb);
+        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                             fastquantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb_pair);
 
         /* the fast quantizer does not use zbin_extra, so
          * do not recalculate */
@@ -1279,7 +1281,10 @@ int vp8cx_encode_inter_macroblock
     /* switch back to the regular quantizer for the encode */
     if (cpi->sf.improved_quant)
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                             quantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  quantb_pair);
     }
 
     /* restore cpi->zbin_mode_boost_enabled */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 3e6ed2a9d..665b2d5dc 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -314,6 +314,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
     z->short_walsh4x4 = x->short_walsh4x4;
     z->quantize_b = x->quantize_b;
+    z->quantize_b_pair = x->quantize_b_pair;
     z->optimize = x->optimize;
 
     /*
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 35d2d5332..b9b371fe4 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -17,8 +17,6 @@
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
 void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
 
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
 void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
 extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
 
@@ -88,7 +86,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
 
     cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
     cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;
     cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+    cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_c;
     cpi->rtcd.search.full_search = vp8_full_search_sad;
     cpi->rtcd.search.refining_search = vp8_refining_search_sad;
     cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 50c4745b1..d22fdb2e6 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -842,7 +842,6 @@ int vp8_hex_search
     int_mv *best_mv,
     int search_param,
     int sad_per_bit,
-    int *num00,
     const vp8_variance_fn_ptr_t *vfp,
     int *mvsadcost[2],
     int *mvcost[2],
@@ -996,12 +995,8 @@ cal_neighbors:
 
     best_mv->as_mv.row = br;
     best_mv->as_mv.col = bc;
 
-    this_mv.as_mv.row = br<<3;
-    this_mv.as_mv.col = bc<<3;
-    this_offset = (unsigned char *)(*(d->base_pre) + d->pre + (br * (in_what_stride)) + bc);
-    return vfp->vf(what, what_stride, this_offset, in_what_stride, &bestsad)
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit) ;
+    return bestsad;
 }
 #undef CHECK_BOUNDS
 #undef CHECK_POINT
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index bf9fa6f76..44ed055db 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -40,7 +40,6 @@ extern int vp8_hex_search
     int_mv *best_mv,
     int search_param,
     int error_per_bit,
-    int *num00,
     const vp8_variance_fn_ptr_t *vf,
     int *mvsadcost[2],
     int *mvcost[2],
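A note on the vp8_hex_search change above, as read from the diff itself: the num00 parameter was never used inside the function, and the search already tracks its own best cost (SAD plus a motion-vector cost term) in bestsad as it examines candidates. Returning bestsad directly therefore removes a redundant variance call and mv_err_cost computation for the winning vector; callers now receive the same SAD-based figure the search minimized.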
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 0fdc572ff..3b86ba041 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1266,11 +1266,17 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
     if (cpi->sf.improved_quant)
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                             quantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  quantb_pair);
     }
     else
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                             fastquantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb_pair);
     }
     if (cpi->sf.improved_quant != last_improved_quant)
         vp8cx_init_quantizer(cpi);
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index e6716fac2..456059cf8 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -762,7 +762,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         if (cpi->sf.search_method == HEX)
         {
             bestsme = vp8_hex_search(x, b, d, &mvp, &d->bmi.mv, step_param,
-                                     sadpb, &num00, &cpi->fn_ptr[BLOCK_16X16],
+                                     sadpb, &cpi->fn_ptr[BLOCK_16X16],
                                      x->mvsadcost, x->mvcost, &best_ref_mv);
             mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
         }
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 49e8e1b9b..503d24123 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -269,7 +269,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 
 #endif
 
-void vp8_quantize_mby(MACROBLOCK *x)
+void vp8_quantize_mby_c(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -282,7 +282,7 @@
         x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
 }
 
-void vp8_quantize_mb(MACROBLOCK *x)
+void vp8_quantize_mb_c(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -293,7 +293,7 @@
 
 }
 
-void vp8_quantize_mbuv(MACROBLOCK *x)
+void vp8_quantize_mbuv_c(MACROBLOCK *x)
 {
     int i;
 
@@ -301,6 +301,22 @@
 
         x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
 }
 
+/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
+ * these two C functions if corresponding optimized routine is not available.
+ * NEON optimized version implements currently the fast quantization for pair
+ * of blocks. */
+void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+    vp8_regular_quantize_b(b1, d1);
+    vp8_regular_quantize_b(b2, d2);
+}
+
+void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+    vp8_fast_quantize_b_c(b1, d1);
+    vp8_fast_quantize_b_c(b2, d2);
+}
+
 static const int qrounding_factors[129] =
 {
@@ -715,3 +731,4 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
 
     vp8cx_init_quantizer(cpi);
 }
+
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
index d9a041071..f1f0156d8 100644
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -17,6 +17,11 @@
 #define prototype_quantize_block(sym) \
     void (sym)(BLOCK *b,BLOCKD *d)
 
+#define prototype_quantize_block_pair(sym) \
+    void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+    void (sym)(MACROBLOCK *x)
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/quantize_x86.h"
 #endif
@@ -31,17 +36,43 @@
 #endif
 
 extern prototype_quantize_block(vp8_quantize_quantb);
 
+#ifndef vp8_quantize_quantb_pair
+#define vp8_quantize_quantb_pair vp8_regular_quantize_b_pair
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_quantb_pair);
+
 #ifndef vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_c
 #endif
 extern prototype_quantize_block(vp8_quantize_fastquantb);
 
+#ifndef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_c
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_fastquantb_pair);
+
 typedef struct
 {
     prototype_quantize_block(*quantb);
+    prototype_quantize_block_pair(*quantb_pair);
     prototype_quantize_block(*fastquantb);
+    prototype_quantize_block_pair(*fastquantb_pair);
 } vp8_quantize_rtcd_vtable_t;
 
+#ifndef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mb);
+
+#ifndef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mbuv);
+
+#ifndef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mby);
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
@@ -51,10 +82,6 @@
 
 extern void vp8_strict_quantize_b(BLOCK *b,BLOCKD *d);
 
-extern void vp8_quantize_mb(MACROBLOCK *x);
-extern void vp8_quantize_mbuv(MACROBLOCK *x);
-extern void vp8_quantize_mby(MACROBLOCK *x);
-
 struct VP8_COMP;
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 2997f77d1..c1ca7d4ed 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -153,7 +153,6 @@ static int vp8_temporal_filter_find_matching_mb_c
     int further_steps;
     int sadpb = x->sadperbit16;
     int bestsme = INT_MAX;
-    int num00 = 0;
 
     BLOCK *b = &x->block[0];
     BLOCKD *d = &x->e_mbd.block[0];
@@ -201,7 +200,7 @@
                                  &best_ref_mv1, &d->bmi.mv,
                                  step_param,
                                  sadpb,
-                                 &num00, &cpi->fn_ptr[BLOCK_16X16],
+                                 &cpi->fn_ptr[BLOCK_16X16],
                                  mvsadcost, mvcost, &best_ref_mv1);
 
 #if ALT_REF_SUBPEL_ENABLED
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 03d42d215..165dada2b 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,6 +15,7 @@
 
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
+VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/dct_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.c