diff options
Diffstat (limited to 'vp8/encoder/arm')
-rw-r--r-- | vp8/encoder/arm/arm_csystemdependent.c | 4 | ||||
-rw-r--r-- | vp8/encoder/arm/neon/fastquantizeb_neon.asm | 139 | ||||
-rw-r--r-- | vp8/encoder/arm/quantize_arm.c | 62 | ||||
-rw-r--r-- | vp8/encoder/arm/quantize_arm.h | 17 |
4 files changed, 217 insertions, 5 deletions
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index af2a5df98..db079d5ed 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -121,8 +121,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon; - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/ + /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; + cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon; + cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon; } #endif diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm index 3dd92b12e..dcf3c5090 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm @@ -10,6 +10,7 @@ EXPORT |vp8_fast_quantize_b_neon| + EXPORT |vp8_fast_quantize_b_pair_neon| INCLUDE asm_enc_offsets.asm @@ -19,6 +20,138 @@ AREA ||.text||, CODE, READONLY, ALIGN=4 +;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); +|vp8_fast_quantize_b_pair_neon| PROC + + stmfd sp!, {r4-r9} + vstmdb sp!, {q4-q7} + + ldr r4, [r0, #vp8_block_coeff] + ldr r5, [r0, #vp8_block_quant_fast] + ldr r6, [r0, #vp8_block_round] + + vld1.16 {q0, q1}, [r4@128] ; load z + + ldr r7, [r2, #vp8_blockd_qcoeff] + + vabs.s16 q4, q0 ; calculate x = abs(z) + vabs.s16 q5, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vshr.s16 q3, q1, #15 + + vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15] + + ldr r4, [r1, #vp8_block_coeff] + + vadd.s16 q4, q6 ; x + Round + vadd.s16 q5, q7 + + vld1.16 {q0, q1}, [r4@128] ; load z2 + + vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q5, q9 + + vabs.s16 q10, q0 
; calculate x2 = abs(z_2) + vabs.s16 q11, q1 + vshr.s16 q12, q0, #15 ; sz2 + vshr.s16 q13, q1, #15 + + ;modify data to have its original sign + veor.s16 q4, q2 ; y^sz + veor.s16 q5, q3 + + vadd.s16 q10, q6 ; x2 + Round + vadd.s16 q11, q7 + + ldr r8, [r2, #vp8_blockd_dequant] + + vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q11, q9 + + vshr.s16 q4, #1 ; right shift 1 after vqdmulh + vshr.s16 q5, #1 + + vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i] + + vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q5, q3 + + vshr.s16 q10, #1 ; right shift 1 after vqdmulh + vshr.s16 q11, #1 + + ldr r9, [r2, #vp8_blockd_dqcoeff] + + veor.s16 q10, q12 ; y2^sz2 + veor.s16 q11, q13 + + vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1 + + + vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q11, q13 + + ldr r6, [r3, #vp8_blockd_qcoeff] + + vmul.s16 q2, q6, q4 ; x * Dequant + vmul.s16 q3, q7, q5 + + ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table + + vceq.s16 q8, q8 ; set q8 to all 1 + + vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2 + + vmul.s16 q12, q6, q10 ; x2 * Dequant + vmul.s16 q13, q7, q11 + + vld1.16 {q6, q7}, [r0@128] ; load inverse scan order + + vtst.16 q14, q4, q8 ; now find eob + vtst.16 q15, q5, q8 ; non-zero element is set to all 1 + + vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant + + ldr r7, [r3, #vp8_blockd_dqcoeff] + + vand q0, q6, q14 ; get all valid numbers from scan array + vand q1, q7, q15 + + vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant + + vtst.16 q2, q10, q8 ; now find eob + vtst.16 q3, q11, q8 ; non-zero element is set to all 1 + + vmax.u16 q0, q0, q1 ; find maximum value in q0, q1 + + vand q10, q6, q2 ; get all valid numbers from scan array + vand q11, q7, q3 + vmax.u16 q10, q10, q11 ; find maximum value in q10, q11 + + vmax.u16 d0, d0, d1 + vmax.u16 d20, d20, d21 + vmovl.u16 q0, d0 + vmovl.u16 q10, d20 + + + vmax.u32 d0, d0, d1 + vmax.u32 d20, d20, d21 + 
vpmax.u32 d0, d0, d0 + vpmax.u32 d20, d20, d20 + + add r4, r2, #vp8_blockd_eob + add r5, r3, #vp8_blockd_eob + + vst1.32 {d0[0]}, [r4@32] + vst1.32 {d20[0]}, [r5@32] + + vldmia sp!, {q4-q7} + ldmfd sp!, {r4-r9} + bx lr + + ENDP ;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) |vp8_fast_quantize_b_neon| PROC @@ -97,10 +230,8 @@ vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant - vmov.32 r0, d0[0] ; this instruction takes 1+13 cycles - ; if we have vfp, we could use - ; vstr s0, [r1, #vp8_blockd_eob] - str r0, [r1, #vp8_blockd_eob] + add r4, r1, #vp8_blockd_eob + vst1.32 {d0[0]}, [r4@32] ldmfd sp!, {r4-r7} bx lr diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c new file mode 100644 index 000000000..52d84013e --- /dev/null +++ b/vp8/encoder/arm/quantize_arm.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include <math.h> +#include "vpx_mem/vpx_mem.h" + +#include "vp8/encoder/quantize.h" +#include "vp8/common/entropy.h" + + +#if HAVE_ARMV7 + +/* The vp8_quantize_mbX functions here differ from the corresponding ones in + * quantize.c only by using the quantize_b_pair function pointer instead of + * the regular quantize_b function pointer */ +void vp8_quantize_mby_neon(MACROBLOCK *x) +{ + int i; + int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED + && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + + for (i = 0; i < 16; i+=2) + x->quantize_b_pair(&x->block[i], &x->block[i+1], + &x->e_mbd.block[i], &x->e_mbd.block[i+1]); + + if(has_2nd_order) + x->quantize_b(&x->block[24], &x->e_mbd.block[24]); +} + +void vp8_quantize_mb_neon(MACROBLOCK *x) +{ + int i; + int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED + && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + + for (i = 0; i < 24; i+=2) + x->quantize_b_pair(&x->block[i], &x->block[i+1], + &x->e_mbd.block[i], &x->e_mbd.block[i+1]); + + if (has_2nd_order) + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); +} + + +void vp8_quantize_mbuv_neon(MACROBLOCK *x) +{ + int i; + + for (i = 16; i < 24; i+=2) + x->quantize_b_pair(&x->block[i], &x->block[i+1], + &x->e_mbd.block[i], &x->e_mbd.block[i+1]); +} + +#endif /* HAVE_ARMV7 */ diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h index af4187ac1..7d2088d2d 100644 --- a/vp8/encoder/arm/quantize_arm.h +++ b/vp8/encoder/arm/quantize_arm.h @@ -16,8 +16,10 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6); +#if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_quantize_fastquantb #define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6 +#endif #endif /* HAVE_ARMV6 */ @@ -25,10 +27,25 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6); #if HAVE_ARMV7 extern prototype_quantize_block(vp8_fast_quantize_b_neon); +extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon); +#if
!CONFIG_RUNTIME_CPU_DETECT #undef vp8_quantize_fastquantb #define vp8_quantize_fastquantb vp8_fast_quantize_b_neon +#undef vp8_quantize_fastquantb_pair +#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon + +#undef vp8_quantize_mb +#define vp8_quantize_mb vp8_quantize_mb_neon + +#undef vp8_quantize_mbuv +#define vp8_quantize_mbuv vp8_quantize_mbuv_neon + +#undef vp8_quantize_mby +#define vp8_quantize_mby vp8_quantize_mby_neon +#endif + #endif /* HAVE_ARMV7 */ #endif |