summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--vp8/encoder/arm/arm_csystemdependent.c4
-rw-r--r--vp8/encoder/arm/neon/fastquantizeb_neon.asm139
-rw-r--r--vp8/encoder/arm/quantize_arm.c62
-rw-r--r--vp8/encoder/arm/quantize_arm.h17
-rw-r--r--vp8/encoder/block.h1
-rw-r--r--vp8/encoder/encodeframe.c11
-rw-r--r--vp8/encoder/ethreading.c1
-rw-r--r--vp8/encoder/generic/csystemdependent.c4
-rw-r--r--vp8/encoder/onyx_if.c10
-rw-r--r--vp8/encoder/quantize.c23
-rw-r--r--vp8/encoder/quantize.h35
-rw-r--r--vp8/vp8cx_arm.mk1
12 files changed, 289 insertions, 19 deletions
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index af2a5df98..db079d5ed 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -121,8 +121,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
- /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
+ cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
}
#endif
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
index 3dd92b12e..dcf3c5090 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -10,6 +10,7 @@
EXPORT |vp8_fast_quantize_b_neon|
+ EXPORT |vp8_fast_quantize_b_pair_neon|
INCLUDE asm_enc_offsets.asm
@@ -19,6 +20,138 @@
AREA ||.text||, CODE, READONLY, ALIGN=4
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+ stmfd sp!, {r4-r9}
+ vstmdb sp!, {q4-q7}
+
+ ldr r4, [r0, #vp8_block_coeff]
+ ldr r5, [r0, #vp8_block_quant_fast]
+ ldr r6, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r4@128] ; load z
+
+ ldr r7, [r2, #vp8_blockd_qcoeff]
+
+ vabs.s16 q4, q0 ; calculate x = abs(z)
+ vabs.s16 q5, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
+
+ ldr r4, [r1, #vp8_block_coeff]
+
+ vadd.s16 q4, q6 ; x + Round
+ vadd.s16 q5, q7
+
+ vld1.16 {q0, q1}, [r4@128] ; load z2
+
+ vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q5, q9
+
+ vabs.s16 q10, q0 ; calculate x2 = abs(z2)
+ vabs.s16 q11, q1
+ vshr.s16 q12, q0, #15 ; sz2
+ vshr.s16 q13, q1, #15
+
+ ;modify data to have its original sign
+ veor.s16 q4, q2 ; y^sz
+ veor.s16 q5, q3
+
+ vadd.s16 q10, q6 ; x2 + Round
+ vadd.s16 q11, q7
+
+ ldr r8, [r2, #vp8_blockd_dequant]
+
+ vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z2)) * Quant) >> 16
+ vqdmulh.s16 q11, q9
+
+ vshr.s16 q4, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q5, #1
+
+ vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
+
+ vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q5, q3
+
+ vshr.s16 q10, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q11, #1
+
+ ldr r9, [r2, #vp8_blockd_dqcoeff]
+
+ veor.s16 q10, q12 ; y2^sz2
+ veor.s16 q11, q13
+
+ vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
+
+
+ vsub.s16 q10, q12 ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
+ vsub.s16 q11, q13
+
+ ldr r6, [r3, #vp8_blockd_qcoeff]
+
+ vmul.s16 q2, q6, q4 ; x * Dequant
+ vmul.s16 q3, q7, q5
+
+ ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
+
+ vmul.s16 q12, q6, q10 ; x2 * Dequant
+ vmul.s16 q13, q7, q11
+
+ vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
+
+ vtst.16 q14, q4, q8 ; now find eob
+ vtst.16 q15, q5, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
+
+ ldr r7, [r3, #vp8_blockd_dqcoeff]
+
+ vand q0, q6, q14 ; get all valid numbers from scan array
+ vand q1, q7, q15
+
+ vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x2 * Dequant
+
+ vtst.16 q2, q10, q8 ; now find eob
+ vtst.16 q3, q11, q8 ; non-zero element is set to all 1
+
+ vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
+
+ vand q10, q6, q2 ; get all valid numbers from scan array
+ vand q11, q7, q3
+ vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
+
+ vmax.u16 d0, d0, d1
+ vmax.u16 d20, d20, d21
+ vmovl.u16 q0, d0
+ vmovl.u16 q10, d20
+
+
+ vmax.u32 d0, d0, d1
+ vmax.u32 d20, d20, d21
+ vpmax.u32 d0, d0, d0
+ vpmax.u32 d20, d20, d20
+
+ add r4, r2, #vp8_blockd_eob
+ add r5, r3, #vp8_blockd_eob
+
+ vst1.32 {d0[0]}, [r4@32]
+ vst1.32 {d20[0]}, [r5@32]
+
+ vldmia sp!, {q4-q7}
+ ldmfd sp!, {r4-r9}
+ bx lr
+
+ ENDP
;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
|vp8_fast_quantize_b_neon| PROC
@@ -97,10 +230,8 @@
vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
- vmov.32 r0, d0[0] ; this instruction takes 1+13 cycles
- ; if we have vfp, we could use
- ; vstr s0, [r1, #vp8_blockd_eob]
- str r0, [r1, #vp8_blockd_eob]
+ add r4, r1, #vp8_blockd_eob
+ vst1.32 {d0[0]}, [r4@32]
ldmfd sp!, {r4-r7}
bx lr
diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 000000000..52d84013e
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* vp8_quantize_mbX functions here differ from corresponding ones in
+ * quantize.c only by using quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer */
+void vp8_quantize_mby_neon(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 16; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+ if(has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 24; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+ if (has_2nd_order)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+}
+
+#endif /* HAVE_ARMV7 */
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
index af4187ac1..7d2088d2d 100644
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -16,8 +16,10 @@
extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
#endif /* HAVE_ARMV6 */
@@ -25,10 +27,25 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
#if HAVE_ARMV7
extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+#undef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
#endif /* HAVE_ARMV7 */
#endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index fbdc89e87..fabd82a06 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -119,6 +119,7 @@ typedef struct
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
void (*quantize_b)(BLOCK *b, BLOCKD *d);
+ void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
} MACROBLOCK;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 1bb026048..98d6232b0 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1142,8 +1142,10 @@ int vp8cx_encode_inter_macroblock
/* Are we using the fast quantizer for the mode selection? */
if(cpi->sf.use_fastquant_for_pick)
{
- cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
- fastquantb);
+ cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ fastquantb);
+ cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ fastquantb_pair);
/* the fast quantizer does not use zbin_extra, so
* do not recalculate */
@@ -1155,7 +1157,10 @@ int vp8cx_encode_inter_macroblock
/* switch back to the regular quantizer for the encode */
if (cpi->sf.improved_quant)
{
- cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+ cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ quantb);
+ cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ quantb_pair);
}
/* restore cpi->zbin_mode_boost_enabled */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 1d92f20af..565e4f22e 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -328,6 +328,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
+ z->quantize_b_pair = x->quantize_b_pair;
z->optimize = x->optimize;
/*
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 928f559f3..9af3f183a 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -17,8 +17,6 @@
void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
@@ -88,7 +86,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+ cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_c;
cpi->rtcd.search.full_search = vp8_full_search_sad;
cpi->rtcd.search.refining_search = vp8_refining_search_sad;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 18ffa02a8..2bdd46d4b 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1200,11 +1200,17 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (cpi->sf.improved_quant)
{
- cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+ cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ quantb);
+ cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ quantb_pair);
}
else
{
- cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+ cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ fastquantb);
+ cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+ fastquantb_pair);
}
if (cpi->sf.improved_quant != last_improved_quant)
vp8cx_init_quantizer(cpi);
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 49e8e1b9b..503d24123 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -269,7 +269,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
#endif
-void vp8_quantize_mby(MACROBLOCK *x)
+void vp8_quantize_mby_c(MACROBLOCK *x)
{
int i;
int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -282,7 +282,7 @@ void vp8_quantize_mby(MACROBLOCK *x)
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
}
-void vp8_quantize_mb(MACROBLOCK *x)
+void vp8_quantize_mb_c(MACROBLOCK *x)
{
int i;
int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -293,7 +293,7 @@ void vp8_quantize_mb(MACROBLOCK *x)
}
-void vp8_quantize_mbuv(MACROBLOCK *x)
+void vp8_quantize_mbuv_c(MACROBLOCK *x)
{
int i;
@@ -301,6 +301,22 @@ void vp8_quantize_mbuv(MACROBLOCK *x)
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
}
+/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
+ * these two C functions if corresponding optimized routine is not available.
+ * The NEON optimized version currently implements only the fast
+ * quantization for pairs of blocks. */
+void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+ vp8_regular_quantize_b(b1, d1);
+ vp8_regular_quantize_b(b2, d2);
+}
+
+void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+ vp8_fast_quantize_b_c(b1, d1);
+ vp8_fast_quantize_b_c(b2, d2);
+}
+
static const int qrounding_factors[129] =
{
@@ -715,3 +731,4 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
vp8cx_init_quantizer(cpi);
}
+
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
index d9a041071..f1f0156d8 100644
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -17,6 +17,11 @@
#define prototype_quantize_block(sym) \
void (sym)(BLOCK *b,BLOCKD *d)
+#define prototype_quantize_block_pair(sym) \
+ void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+ void (sym)(MACROBLOCK *x)
#if ARCH_X86 || ARCH_X86_64
#include "x86/quantize_x86.h"
@@ -31,17 +36,43 @@
#endif
extern prototype_quantize_block(vp8_quantize_quantb);
+#ifndef vp8_quantize_quantb_pair
+#define vp8_quantize_quantb_pair vp8_regular_quantize_b_pair
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_quantb_pair);
+
#ifndef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_c
#endif
extern prototype_quantize_block(vp8_quantize_fastquantb);
+#ifndef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_c
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_fastquantb_pair);
+
typedef struct
{
prototype_quantize_block(*quantb);
+ prototype_quantize_block_pair(*quantb_pair);
prototype_quantize_block(*fastquantb);
+ prototype_quantize_block_pair(*fastquantb_pair);
} vp8_quantize_rtcd_vtable_t;
+#ifndef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mb);
+
+#ifndef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mbuv);
+
+#ifndef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mby);
#if CONFIG_RUNTIME_CPU_DETECT
#define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
@@ -51,10 +82,6 @@ typedef struct
extern void vp8_strict_quantize_b(BLOCK *b,BLOCKD *d);
-extern void vp8_quantize_mb(MACROBLOCK *x);
-extern void vp8_quantize_mbuv(MACROBLOCK *x);
-extern void vp8_quantize_mby(MACROBLOCK *x);
-
struct VP8_COMP;
extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 03d42d215..165dada2b 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,6 +15,7 @@
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
+VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/dct_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c