| author | Ronald S. Bultje <rbultje@google.com> | 2012-11-01 11:09:58 -0700 |
|---|---|---|
| committer | Ronald S. Bultje <rbultje@google.com> | 2012-11-01 16:31:22 -0700 |
| commit | 4b2c2b9aa4a273a23d90ddb3bbf6dfb3482e0b8f (patch) | |
| tree | 20eef975f1a8c28978d826a354092433b9093588 /vp9/encoder/arm | |
| parent | 6c280c2299f078a475dc87e7615fdf1a4998cd31 (diff) | |
Rename vp8/ codec directory to vp9/.
Change-Id: Ic084c475844b24092a433ab88138cf58af3abbe4
Diffstat (limited to 'vp9/encoder/arm')
37 files changed, 6992 insertions(+), 0 deletions(-)
diff --git a/vp9/encoder/arm/arm_csystemdependent.c b/vp9/encoder/arm/arm_csystemdependent.c new file mode 100644 index 000000000..99129de98 --- /dev/null +++ b/vp9/encoder/arm/arm_csystemdependent.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "vpx_ports/arm.h" +#include "vp9/encoder/variance.h" +#include "vp9/encoder/onyx_int.h" + +extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); + +void vp9_arch_arm_encoder_init(VP9_COMP *cpi) { +#if CONFIG_RUNTIME_CPU_DETECT + int flags = cpi->common.rtcd.flags; + +#if HAVE_ARMV5TE + if (flags & HAS_EDSP) { + } +#endif + +#if HAVE_ARMV6 + if (flags & HAS_MEDIA) { + cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6; + /*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c; + cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c; + cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c; + cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/ + + /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/ + cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6; + /*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c; + cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/ + cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6; + + /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/ + cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6; + /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c; + cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/ + cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6; + cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6; + cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6; + cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6; + + cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6; + /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/ + + cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6; + cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6; + cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6; + cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6; + cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6; + + /*cpi->rtcd.encodemb.berr = vp9_block_error_c; + cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c; + cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/ + cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6; + cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6; + cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6; + + /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/ + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6; + } +#endif + +#if HAVE_ARMV7 + if (flags & HAS_NEON) { + cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon; + cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon; + cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon; + cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon; + cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon; + + 
/*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/ + cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon; + cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon; + cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon; + cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon; + + /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/ + cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon; + /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c; + cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/ + cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon; + cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon; + cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon; + cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon; + + cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon; + /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/ + + cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon; + cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon; + cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon; + cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon; + cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon; + + /*cpi->rtcd.encodemb.berr = vp9_block_error_c; + cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c; + cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/ + cpi->rtcd.encodemb.subb = vp9_subtract_b_neon; + cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon; + cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon; + + /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; + cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/ + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon; + cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon; + } +#endif + +#if HAVE_ARMV7 +#if CONFIG_RUNTIME_CPU_DETECT + if (flags & HAS_NEON) +#endif + { + vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon; + } +#endif +#endif +} diff --git a/vp9/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp9/encoder/arm/armv5te/boolhuff_armv5te.asm new file mode 100644 index 000000000..180637e68 --- /dev/null +++ b/vp9/encoder/arm/armv5te/boolhuff_armv5te.asm @@ -0,0 +1,286 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_start_encode| + EXPORT |vp9_encode_bool| + EXPORT |vp8_stop_encode| + EXPORT |vp8_encode_value| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 BOOL_CODER *br +; r1 unsigned char *source + +|vp8_start_encode| PROC + mov r12, #0 + mov r3, #255 + mvn r2, #23 + str r12, [r0, #vp9_writer_lowvalue] + str r3, [r0, #vp9_writer_range] + str r12, [r0, #vp9_writer_value] + str r2, [r0, #vp9_writer_count] + str r12, [r0, #vp9_writer_pos] + str r1, [r0, #vp9_writer_buffer] + bx lr + ENDP + +; r0 BOOL_CODER *br +; r1 int bit +; r2 int probability +|vp9_encode_bool| PROC + push {r4-r9, lr} + + mov r4, r2 + + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + + sub r7, r5, #1 ; range-1 + + cmp r1, #0 + mul r6, r4, r7 ; ((range-1) * probability) + + mov r7, #1 + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8) + + addne r2, r2, r4 ; if (bit) lowvalue += split + subne r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + pop {r4-r9, pc} + ENDP + +; r0 BOOL_CODER *br +|vp8_stop_encode| PROC + push {r4-r10, lr} + + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + + mov r10, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne stop_encode_loop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + pop {r4-r10, pc} + + ENDP + +; r0 BOOL_CODER *br +; r1 int data +; r2 int bits +|vp8_encode_value| PROC + push {r4-r11, lr} + + mov r10, r2 + + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + + rsb r4, r10, #32 ; 32-n + + ; v is kept in r1 during the token pack loop + lsl r1, r1, r4 ; r1 = v << 32 - n + +encode_value_loop + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsls r1, r1, #1 ; bit = v >> n + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + addcs r2, r2, r4 ; if (bit) lowvalue += split + subcs r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_ev ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_ev + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_ev +token_zero_while_loop_ev + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_ev + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_ev + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_ev + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_ev + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne encode_value_loop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + pop {r4-r11, pc} + ENDP + + END diff --git a/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm new file mode 100644 index 000000000..bf299770b --- /dev/null +++ b/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm @@ -0,0 +1,291 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8cx_pack_tokens_armv5| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 vp9_writer *w +; r1 const TOKENEXTRA *p +; r2 int xcount +; r3 vp8_coef_encodings +; s0 vp8_extra_bits +; s1 vp8_coef_tree +|vp8cx_pack_tokens_armv5| PROC + push {r4-r11, lr} + + ; Add size of xcount * sizeof (TOKENEXTRA) to get stop + ; sizeof (TOKENEXTRA) is 8 + sub sp, sp, #12 + add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) + str r2, [sp, #0] + str r3, [sp, #8] ; save vp8_coef_encodings + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + b check_p_lt_stop + +while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #8] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp9_token_value] ; v + ldr r8, [r4, #vp9_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #52] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. 
So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #52] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #48] ; vp8_extra_bits + ; Add t * sizeof (vp9_extra_bit_struct) to get the desired + ; element. Here vp9_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp9_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp9_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp9_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp9_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp9_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp9_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp9_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp9_writer_buffer] + lsr r6, r2, #24 ; lowvalue 
>> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + add sp, sp, #12 + pop {r4-r11, pc} + ENDP + + END diff --git a/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm new file mode 100644 index 000000000..a1c647d6c --- /dev/null +++ b/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm @@ -0,0 +1,327 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8cx_pack_mb_row_tokens_armv5| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 vp9_writer *w +; r2 vp8_coef_encodings +; r3 vp8_extra_bits +; s0 vp8_coef_tree + +|vp8cx_pack_mb_row_tokens_armv5| PROC + push {r4-r11, lr} + sub sp, sp, #24 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r2, [sp, #20] ; save vp8_coef_encodings + str r5, [sp, #12] ; save mb_rows + str r3, [sp, #8] ; save vp8_extra_bits + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + + mov r0, r1 ; keep same as other loops + + ldr r2, [r0, #vp9_writer_lowvalue] + ldr r5, [r0, #vp9_writer_range] + ldr r3, [r0, #vp9_writer_count] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actuall work gets done here! + +while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #20] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp9_token_value] ; v + ldr r8, [r4, #vp9_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #60] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. 
+ clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #60] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #8] ; vp8_extra_bits + ; Add t * sizeof (vp9_extra_bit_struct) to get the desired + ; element. Here vp9_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp9_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp9_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp9_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp9_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + 
lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp9_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp9_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp9_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp9_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, #1 + add r7, r7, #TOKENLIST_SZ ; next element in the array + str r6, [sp, #12] + bne mb_row_loop + + str r2, [r0, #vp9_writer_lowvalue] + str r5, [r0, #vp9_writer_range] + str r3, [r0, #vp9_writer_count] + add sp, sp, #24 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist + + END diff --git a/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm new file mode 100644 index 000000000..86c2feb4a --- /dev/null +++ b/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@ -0,0 +1,465 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8cx_pack_tokens_into_partitions_armv5| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 unsigned char *cx_data +; r2 int num_part +; r3 *size +; s0 vp8_coef_encodings +; s1 vp8_extra_bits, +; s2 const vp9_tree_index *, + +|vp8cx_pack_tokens_into_partitions_armv5| PROC + push {r4-r11, lr} + sub sp, sp, #44 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r5, [sp, #36] ; save mb_rows + str r1, [sp, #24] ; save cx_data + str r2, [sp, #20] ; save num_part + str r3, [sp, #8] ; save *size + + ; *size = 3*(num_part -1 ); + sub r2, r2, #1 ; num_part - 1 + add r2, r2, r2, lsl #1 ; 3*(num_part - 1) + str r2, [r3] + + add r2, r2, r1 ; cx_data + *size + str r2, [sp, #40] ; ptr + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + str r7, [sp, #32] ; store start of cpi->tp_list + + ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi + add r0, r0, r11 + + mov r11, #0 + str r11, [sp, #28] ; i + +numparts_loop + ldr r10, [sp, #40] ; ptr + ldr r5, [sp, #36] ; move mb_rows to the counting section + sub r5, r5, r11 ; move start point with each partition + ; mb_rows starts at i + str r5, [sp, #12] + + ; Reset all of the VP8 Writer data for each partition that + ; is processed. + ; start_encode + mov r2, #0 ; vp9_writer_lowvalue + mov r5, #255 ; vp9_writer_range + mvn r3, #23 ; vp9_writer_count + + str r2, [r0, #vp9_writer_value] + str r2, [r0, #vp9_writer_pos] + str r10, [r0, #vp9_writer_buffer] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actual work gets done here! + +while_p_lt_stop + ldrb r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #80] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldrb r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp9_token_value] ; v + ldr r8, [r4, #vp9_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #88] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsl r12, r6, r4 ; r12 = v << 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsls r12, r12, #1 ; bb = v >> n + mul r6, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #88] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldrb r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #84] ; vp8_extra_bits + ; Add t * sizeof (vp9_extra_bit_struct) to get the desired + ; element. Here vp9_extra_bit_struct == 16 + add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp9_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp9_extra_bit_struct_len] ; L + ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp9_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp9_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rsb r4, r8, #32 + lsl r12, r7, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsls r12, r12, #1 ; v >> n + mul r6, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] + lsl 
r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp9_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp9_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp9_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp9_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp9_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + ldr r10, [sp, #20] ; num_parts + mov r1, #TOKENLIST_SZ + mul r1, r10, r1 + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, r10 + add r7, r7, r1 ; next element in the array + str r6, [sp, #12] + bgt mb_row_loop + + mov r12, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp9_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp9_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp9_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp9_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp9_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp9_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r12, r12, #1 + bne stop_encode_loop + + ldr r10, [sp, #8] ; *size + ldr r11, [r10] + ldr r4, [r0, #vp9_writer_pos] ; w->pos + add r11, r11, r4 ; *size += w->pos + str r11, [r10] + + ldr r9, [sp, #20] ; num_parts + sub r9, r9, #1 + ldr r10, [sp, #28] ; i + cmp r10, r9 ; if(i<(num_part - 1)) + bge skip_write_partition + + ldr r12, [sp, #40] ; ptr + add r12, r12, r4 ; ptr += w->pos + str r12, [sp, #40] + + ldr r9, [sp, #24] ; cx_data + mov r8, r4, asr #8 + strb r4, [r9, #0] + strb r8, [r9, #1] + mov r4, r4, asr #16 + strb r4, [r9, #2] + + add r9, r9, #3 ; cx_data += 3 + str r9, [sp, #24] + +skip_write_partition + + ldr r11, [sp, #28] ; i + ldr r10, [sp, #20] ; num_parts + + add r11, r11, #1 ; i++ + str r11, [sp, #28] + + ldr r7, [sp, #32] ; cpi->tp_list[i] + mov r1, #TOKENLIST_SZ + add r7, r7, r1 ; next element in cpi->tp_list + str r7, [sp, #32] ; cpi->tp_list[i+1] + + cmp r10, r11 + bgt numparts_loop + + + add sp, sp, #44 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist +_VP8_COMP_bc2_ + DCD vp8_comp_bc2 + + END diff --git a/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm new file mode 100644 index 000000000..ae2f6030d --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm @@ -0,0 +1,224 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_fast_quantize_b_armv6| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 BLOCK *b +; r1 BLOCKD *d +|vp8_fast_quantize_b_armv6| PROC + stmfd sp!, {r1, r4-r11, lr} + + ldr r3, [r0, #vp8_block_coeff] ; coeff + ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast + ldr r5, [r0, #vp8_block_round] ; round + ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff + ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff + ldr r8, [r1, #vp8_blockd_dequant] ; dequant + + ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction + ; is used to update the counter so that + ; it can be used to mark nonzero + ; quantized coefficient pairs. + + mov r1, #0 ; flags for quantized coeffs + + ; PART 1: quantization and dequantization loop +loop + ldr r9, [r3], #4 ; [z1 | z0] + ldr r10, [r5], #4 ; [r1 | r0] + ldr r11, [r4], #4 ; [q1 | q0] + + ssat16 lr, #1, r9 ; [sz1 | sz0] + eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] + ssub16 r9, r9, lr ; x = (z ^ sz) - sz + sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] + + ldr r12, [r3], #4 ; [z3 | z2] + + smulbb r0, r9, r11 ; [(x0+r0)*q0] + smultt r9, r9, r11 ; [(x1+r1)*q1] + + ldr r10, [r5], #4 ; [r3 | r2] + + ssat16 r11, #1, r12 ; [sz3 | sz2] + eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] + pkhtb r0, r9, r0, asr #16 ; [y1 | y0] + ldr r9, [r4], #4 ; [q3 | q2] + ssub16 r12, r12, r11 ; x = (z ^ sz) - sz + + sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] + + eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] + + smulbb r10, r12, r9 ; [(x2+r2)*q2] + smultt r12, r12, r9 ; [(x3+r3)*q3] + + ssub16 r0, r0, lr ; x = (y ^ sz) - sz + + cmp r0, #0 ; check if zero + orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs + + str r0, [r6], #4 ; *qcoeff++ = x + ldr r9, [r8], #4 ; [dq1 | dq0] + + pkhtb r10, r12, r10, asr #16 ; [y3 | y2] + eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] + ssub16 r10, r10, r11 ; x = (y ^ sz) - sz + + cmp r10, #0 ; check if zero + orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs + + str r10, [r6], #4 ; *qcoeff++ = x + ldr r11, [r8], #4 ; [dq3 | dq2] + + smulbb r12, r0, r9 ; [x0*dq0] + smultt r0, r0, r9 ; [x1*dq1] + + smulbb r9, r10, r11 ; [x2*dq2] + smultt r10, r10, r11 ; [x3*dq3] + + lsls r2, r2, #2 ; update loop counter + strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] + strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] + strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] + strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] + add r7, r7, #8 ; dqcoeff += 8 + bne loop + + ; PART 2: check position for eob... + mov lr, #0 ; init eob + cmp r1, #0 ; coeffs after quantization? 
+ ldr r11, [sp, #0] ; restore BLOCKD pointer + beq end ; skip eob calculations if all zero + + ldr r0, [r11, #vp8_blockd_qcoeff] + + ; check shortcut for nonzero qcoeffs + tst r1, #0x80 + bne quant_coeff_15_14 + tst r1, #0x20 + bne quant_coeff_13_11 + tst r1, #0x8 + bne quant_coeff_12_7 + tst r1, #0x40 + bne quant_coeff_10_9 + tst r1, #0x10 + bne quant_coeff_8_3 + tst r1, #0x2 + bne quant_coeff_6_5 + tst r1, #0x4 + bne quant_coeff_4_2 + b quant_coeff_1_0 + +quant_coeff_15_14 + ldrh r2, [r0, #30] ; rc=15, i=15 + mov lr, #16 + cmp r2, #0 + bne end + + ldrh r3, [r0, #28] ; rc=14, i=14 + mov lr, #15 + cmp r3, #0 + bne end + +quant_coeff_13_11 + ldrh r2, [r0, #22] ; rc=11, i=13 + mov lr, #14 + cmp r2, #0 + bne end + +quant_coeff_12_7 + ldrh r3, [r0, #14] ; rc=7, i=12 + mov lr, #13 + cmp r3, #0 + bne end + + ldrh r2, [r0, #20] ; rc=10, i=11 + mov lr, #12 + cmp r2, #0 + bne end + +quant_coeff_10_9 + ldrh r3, [r0, #26] ; rc=13, i=10 + mov lr, #11 + cmp r3, #0 + bne end + + ldrh r2, [r0, #24] ; rc=12, i=9 + mov lr, #10 + cmp r2, #0 + bne end + +quant_coeff_8_3 + ldrh r3, [r0, #18] ; rc=9, i=8 + mov lr, #9 + cmp r3, #0 + bne end + + ldrh r2, [r0, #12] ; rc=6, i=7 + mov lr, #8 + cmp r2, #0 + bne end + +quant_coeff_6_5 + ldrh r3, [r0, #6] ; rc=3, i=6 + mov lr, #7 + cmp r3, #0 + bne end + + ldrh r2, [r0, #4] ; rc=2, i=5 + mov lr, #6 + cmp r2, #0 + bne end + +quant_coeff_4_2 + ldrh r3, [r0, #10] ; rc=5, i=4 + mov lr, #5 + cmp r3, #0 + bne end + + ldrh r2, [r0, #16] ; rc=8, i=3 + mov lr, #4 + cmp r2, #0 + bne end + + ldrh r3, [r0, #8] ; rc=4, i=2 + mov lr, #3 + cmp r3, #0 + bne end + +quant_coeff_1_0 + ldrh r2, [r0, #2] ; rc=1, i=1 + mov lr, #2 + cmp r2, #0 + bne end + + mov lr, #1 ; rc=0, i=0 + +end + str lr, [r11, #vp8_blockd_eob] + ldmfd sp!, {r1, r4-r11, pc} + + ENDP + +loop_count + DCD 0x1000000 + + END + diff --git a/vp9/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp9/encoder/arm/armv6/vp8_mse16x16_armv6.asm new file mode 100644 index 000000000..8e7283667 --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_mse16x16_armv6.asm @@ -0,0 +1,138 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_mse16x16_armv6| + + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on vp9_variance16x16_armv6. In this function, sum is never used. +; So, we can remove this part of calculation. 
+ +|vp8_mse16x16_armv6| PROC + + push {r4-r9, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #16 ; set loop counter to 16 (=block height) + mov r4, #0 ; initialize sse = 0 + +loop + ; 1st 4 pixels + ldr r5, [r0, #0x0] ; load 4 src pixels + ldr r6, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r5, r6 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0x4] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r2, #0x4] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + ldr r5, [r0, #0x8] ; load 4 src pixels + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r6, [r2, #0x8] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0xc] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r6, [r2, #0xc] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + subs r12, r12, #1 ; next row + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; 
return stuff + ldr r1, [sp, #28] ; get address of sse + mov r0, r4 ; return sse + str r4, [r1] ; store sse + + pop {r4-r9, pc} + + ENDP + + END diff --git a/vp9/encoder/arm/armv6/vp8_sad16x16_armv6.asm b/vp9/encoder/arm/armv6/vp8_sad16x16_armv6.asm new file mode 100644 index 000000000..1b4f5cf3b --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_sad16x16_armv6.asm @@ -0,0 +1,96 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sad16x16_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 const unsigned char *src_ptr +; r1 int src_stride +; r2 const unsigned char *ref_ptr +; r3 int ref_stride +; stack max_sad (not used) +|vp8_sad16x16_armv6| PROC + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + mov r4, #0 ; sad = 0; + mov r5, #8 ; loop count + +loop + ; 1st row + ldr r6, [r0, #0x0] ; load 4 src pixels (1A) + ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) + ldr r7, [r0, #0x4] ; load 4 src pixels (1A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) + ldr r10, [r0, #0x8] ; load 4 src pixels (1B) + ldr r11, [r0, #0xC] ; load 4 src pixels (1B) + + usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + ldr r6, [r0, #0x0] ; load 4 src pixels (2A) + ldr r7, [r0, #0x4] ; load 4 src pixels (2A) + add r4, r4, r8 ; add partial sad values + + ; 2nd row + ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) + ldr r10, [r0, #0x8] ; load 4 src pixels (2B) + ldr r11, [r0, #0xC] ; load 4 src pixels (2B) + + usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + subs r5, r5, #1 ; decrement loop counter + add r4, r4, r8 ; add partial sad values + + bne loop + + mov r0, r4 ; return sad + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git a/vp9/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm b/vp9/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm new file mode 100644 index 000000000..8034c1db9 --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm @@ -0,0 +1,262 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp8_short_fdct4x4_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY +; void vp8_short_fdct4x4_c(short *input, short *output, int pitch) +|vp8_short_fdct4x4_armv6| PROC + + stmfd sp!, {r4 - r12, lr} + + ; PART 1 + + ; coeffs 0-3 + ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2] + + ldr r10, c7500 + ldr r11, c14500 + ldr r12, c0x22a453a0 ; [2217*4 | 5352*4] + ldr lr, c0x00080008 + ror r5, r5, #16 ; [i2 | i3] + + qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift + qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift + + add r0, r0, r2 ; update input pointer + + qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8 + smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8 + + smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500) + + ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6] + + pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2 + pkhbt r6, r5, r7, lsl #4 ; [o3 | o2] + + str r6, [r1, #4] + + ; coeffs 4-7 + ror r9, r9, #16 ; [i6 | i7] + + qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift + qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift + + add r0, r0, r2 ; update input pointer + + qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8 + smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8 + + smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500) + + ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10] + + pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2 + pkhbt r6, r8, r7, lsl #4 ; [o7 | o6] + + str r6, [r1, #12] + + ; coeffs 8-11 + ror r5, r5, #16 ; [i10 | i11] + + qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift + qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift + + add r0, r0, r2 ; update input pointer + + qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8 + smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8 + + smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500) + + ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14] + + pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2 + pkhbt r6, r8, r7, lsl #4 ; [o11 | o10] + + str r6, [r1, #20] + + ; coeffs 12-15 + ror r5, r5, #16 ; [i14 | i15] + + qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift + qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift + + qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8 + smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8 + + smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500) + + pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2 + pkhbt r6, r5, r7, lsl #4 ; [o15 | o14] + + str r6, [r1, #28] + + + ; PART 2 ------------------------------------------------- + ldr r11, c12000 + 
ldr r10, c51000 + ldr lr, c0x00070007 + + qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12] + qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8] + qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8] + qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12] + + qadd16 r4, r4, lr ; a1 + 7 + + add r0, r11, #0x10000 ; add (d!=0) + + qadd16 r2, r4, r5 ; a1 + b1 + 7 + qsub16 r3, r4, r5 ; a1 - b1 + 7 + + ldr r12, c0x08a914e8 ; [2217 | 5352] + + lsl r8, r2, #16 ; prepare bottom halfword for scaling + asr r2, r2, #4 ; scale top halfword + lsl r9, r3, #16 ; prepare bottom halfword for scaling + asr r3, r3, #4 ; scale top halfword + pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword + pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword + + smulbt r2, r6, r12 ; [ ------ | c1*2217] + str r4, [r1, #0] ; [ o1 | o0] + smultt r3, r6, r12 ; [c1*2217 | ------ ] + str r5, [r1, #16] ; [ o9 | o8] + + smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] + smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] + + smulbb r2, r6, r12 ; [ ------ | c1*5352] + smultb r3, r6, r12 ; [c1*5352 | ------ ] + + lsls r6, r7, #16 ; d1 != 0 ? + addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) + addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) + asrs r6, r7, #16 + addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) + addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) + + smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 + smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 + + pkhtb r9, r9, r8, asr #16 + + sub r4, r4, r2 + sub r5, r5, r3 + + ldr r3, [r1, #4] ; [i3 | i2] + + pkhtb r5, r5, r4, asr #16 ; [o13|o12] + + str r9, [r1, #8] ; [o5 | 04] + + ldr r9, [r1, #12] ; [i7 | i6] + ldr r8, [r1, #28] ; [i15|i14] + ldr r2, [r1, #20] ; [i11|i10] + str r5, [r1, #24] ; [o13|o12] + + qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14] + qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10] + + qadd16 r4, r4, lr ; a1 + 7 + + qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10] + qadd16 r2, r4, r5 ; a1 + b1 + 7 + qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14] + qsub16 r3, r4, r5 ; a1 - b1 + 7 + + lsl r8, r2, #16 ; prepare bottom halfword for scaling + asr r2, r2, #4 ; scale top halfword + lsl r9, r3, #16 ; prepare bottom halfword for scaling + asr r3, r3, #4 ; scale top halfword + pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword + pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword + + smulbt r2, r6, r12 ; [ ------ | c1*2217] + str r4, [r1, #4] ; [ o3 | o2] + smultt r3, r6, r12 ; [c1*2217 | ------ ] + str r5, [r1, #20] ; [ o11 | o10] + + smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] + smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] + + smulbb r2, r6, r12 ; [ ------ | c1*5352] + smultb r3, r6, r12 ; [c1*5352 | ------ ] + + lsls r6, r7, #16 ; d1 != 0 ? 
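+ ; r0 holds 12000 + 0x10000: using it instead of r11 (12000) before the
+ ; asr #16 folds the extra +1 applied when d1 != 0 into the rounding constant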
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) + addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) + + asrs r6, r7, #16 + addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) + addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) + + smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 + smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 + + pkhtb r9, r9, r8, asr #16 + + sub r4, r4, r2 + sub r5, r5, r3 + + str r9, [r1, #12] ; [o7 | o6] + pkhtb r5, r5, r4, asr #16 ; [o15|o14] + + str r5, [r1, #28] ; [o15|o14] + + ldmfd sp!, {r4 - r12, pc} + + ENDP + +; Used constants +c7500 + DCD 7500 +c14500 + DCD 14500 +c0x22a453a0 + DCD 0x22a453a0 +c0x00080008 + DCD 0x00080008 +c12000 + DCD 12000 +c51000 + DCD 51000 +c0x00070007 + DCD 0x00070007 +c0x08a914e8 + DCD 0x08a914e8 + + END diff --git a/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm new file mode 100644 index 000000000..0ca74387b --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm @@ -0,0 +1,265 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_subtract_mby_armv6| + EXPORT |vp8_subtract_mbuv_armv6| + EXPORT |vp8_subtract_b_armv6| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 BLOCK *be +; r1 BLOCKD *bd +; r2 int pitch +|vp8_subtract_b_armv6| PROC + + stmfd sp!, {r4-r9} + + ldr r4, [r0, #vp8_block_base_src] + ldr r5, [r0, #vp8_block_src] + ldr r6, [r0, #vp8_block_src_diff] + + ldr r3, [r4] + ldr r7, [r0, #vp8_block_src_stride] + add r3, r3, r5 ; src = *base_src + src + ldr r8, [r1, #vp8_blockd_predictor] + + mov r9, #4 ; loop count + +loop_block + + ldr r0, [r3], r7 ; src + ldr r1, [r8], r2 ; pred + + uxtb16 r4, r0 ; [s2 | s0] + uxtb16 r5, r1 ; [p2 | p0] + uxtb16 r0, r0, ror #8 ; [s3 | s1] + uxtb16 r1, r1, ror #8 ; [p3 | p1] + + usub16 r4, r4, r5 ; [d2 | d0] + usub16 r5, r0, r1 ; [d3 | d1] + + subs r9, r9, #1 ; decrement loop counter + + pkhbt r0, r4, r5, lsl #16 ; [d1 | d0] + pkhtb r1, r5, r4, asr #16 ; [d3 | d2] + + str r0, [r6, #0] ; diff + str r1, [r6, #4] ; diff + + add r6, r6, r2, lsl #1 ; update diff pointer + bne loop_block + + ldmfd sp!, {r4-r9} + mov pc, lr + + ENDP + + +; r0 short *diff +; r1 unsigned char *usrc +; r2 unsigned char *vsrc +; r3 unsigned char *pred +; stack int stride +|vp8_subtract_mbuv_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + add r0, r0, #512 ; set *diff point to Cb + add r3, r3, #256 ; set *pred point to Cb + + mov r4, #8 ; loop count + ldr r5, [sp, #40] ; stride + + ; Subtract U block +loop_u + ldr r6, [r1] ; src (A) + ldr r7, [r3], #4 ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r1, #4] ; src (B) + ldr r11, [r3], #4 ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, 
ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + add r1, r1, r5 ; update usrc pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (B) + + bne loop_u + + mov r4, #8 ; loop count + + ; Subtract V block +loop_v + ldr r6, [r2] ; src (A) + ldr r7, [r3], #4 ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r2, #4] ; src (B) + ldr r11, [r3], #4 ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + add r2, r2, r5 ; update vsrc pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (B) + + bne loop_v + + ldmfd sp!, {r4-r12, pc} + + ENDP + + +; r0 short *diff +; r1 unsigned char *src +; r2 unsigned char *pred +; r3 int stride +|vp8_subtract_mby_armv6| PROC + + stmfd sp!, {r4-r11} + + mov r4, #16 +loop + ldr r6, [r1] ; src (A) + ldr r7, [r2], #4 ; pred (A) + + uxtb16 r8, r6 ; [s2 | s0] (A) + uxtb16 r9, r7 ; [p2 | p0] (A) + uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) + uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) + + usub16 r6, r8, r9 ; [d2 | d0] (A) + usub16 r7, r10, r11 ; [d3 | d1] (A) + + ldr r10, [r1, #4] ; src (B) + ldr r11, [r2], #4 ; pred (B) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) + + str r8, [r0], #4 ; diff (A) + uxtb16 r8, r10 ; [s2 | s0] (B) + str r9, [r0], #4 ; diff (A) + + uxtb16 r9, r11 ; [p2 | p0] (B) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) + + usub16 r6, r8, r9 ; [d2 | d0] (B) + usub16 r7, r10, r11 ; [d3 | d1] (B) + + ldr r10, [r1, #8] ; src (C) + ldr r11, [r2], #4 ; pred (C) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) + + str r8, [r0], #4 ; diff (B) + uxtb16 r8, r10 ; [s2 | s0] (C) + str r9, [r0], #4 ; diff (B) + + uxtb16 r9, r11 ; [p2 | p0] (C) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (C) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (C) + + usub16 r6, r8, r9 ; [d2 | d0] (C) + usub16 r7, r10, r11 ; [d3 | d1] (C) + + ldr r10, [r1, #12] ; src (D) + ldr r11, [r2], #4 ; pred (D) + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) + + str r8, [r0], #4 ; diff (C) + uxtb16 r8, r10 ; [s2 | s0] (D) + str r9, [r0], #4 ; diff (C) + + uxtb16 r9, r11 ; [p2 | p0] (D) + uxtb16 r10, r10, ror #8 ; [s3 | s1] (D) + uxtb16 r11, r11, ror #8 ; [p3 | p1] (D) + + usub16 r6, r8, r9 ; [d2 | d0] (D) + usub16 r7, r10, r11 ; [d3 | d1] (D) + + add r1, r1, r3 ; update src pointer + + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) + + str r8, [r0], #4 ; diff (D) + subs r4, r4, #1 ; update loop counter + str r9, [r0], #4 ; diff (D) + + bne loop + + ldmfd sp!, {r4-r11} + mov pc, lr + + ENDP + + END + diff --git a/vp9/encoder/arm/armv6/vp8_variance16x16_armv6.asm 
b/vp9/encoder/arm/armv6/vp8_variance16x16_armv6.asm new file mode 100644 index 000000000..110db3074 --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_variance16x16_armv6.asm @@ -0,0 +1,154 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_variance16x16_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance16x16_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, 
r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git a/vp9/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp9/encoder/arm/armv6/vp8_variance8x8_armv6.asm new file mode 100644 index 000000000..101f6838d --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_variance8x8_armv6.asm @@ -0,0 +1,101 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_variance8x8_armv6| + + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance8x8_armv6| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; substract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; substract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + + END diff --git a/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm new file mode 100644 index 000000000..7a8cafd3b --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm @@ -0,0 +1,182 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_variance_halfpixvar16x16_h_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance_halfpixvar16x16_h_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, 
r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm new file mode 100644 index 000000000..6ad5e90bb --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_variance_halfpixvar16x16_hv_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance_halfpixvar16x16_hv_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; pointer to pixels on the next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load source pixels a, row N + ldr r6, [r0, #1] ; load source pixels b, row N + ldr r5, [r9, #0] ; load source pixels c, row N+1 + ldr r7, [r9, #1] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #0] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load source pixels a, row N + ldr r6, [r0, #5] ; load source pixels b, row N + ldr r5, [r9, #4] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #5] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #4] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load source pixels a, row N + ldr 
r6, [r0, #9] ; load source pixels b, row N + ldr r5, [r9, #8] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #9] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #8] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load source pixels a, row N + ldr r6, [r0, #13] ; load source pixels b, row N + ldr r5, [r9, #12] ; load source pixels c, row N+1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + ldr r7, [r9, #13] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #12] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END diff --git a/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm new file mode 100644 index 000000000..0471d3d67 --- /dev/null +++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm @@ -0,0 +1,184 @@ +; 
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_variance_halfpixvar16x16_v_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance_halfpixvar16x16_v_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; set src pointer to next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r9, #0] ; load 4 src pixels from next row + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r9, #4] ; load 4 src pixels from next row + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r9, #8] ; load 4 src pixels from next row + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed 
operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r9, #12] ; load 4 src pixels from next row + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/vp9/encoder/arm/armv6/walsh_v6.asm b/vp9/encoder/arm/armv6/walsh_v6.asm new file mode 100644 index 000000000..5eaf3f25a --- /dev/null +++ b/vp9/encoder/arm/armv6/walsh_v6.asm @@ -0,0 +1,212 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + EXPORT |vp8_short_walsh4x4_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) +; r0 short *input, +; r1 short *output, +; r2 int pitch +|vp8_short_walsh4x4_armv6| PROC + + stmdb sp!, {r4 - r11, lr} + + ldrd r4, r5, [r0], r2 + ldr lr, c00040004 + ldrd r6, r7, [r0], r2 + + ; 0-3 + qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2] + qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2] + + ldrd r8, r9, [r0], r2 + ; 4-7 + qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6] + qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6] + + ldrd r10, r11, [r0] + ; 8-11 + qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10] + qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10] + + ; 12-15 + qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14] + qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14] + + + lsls r2, r3, #16 + smuad r11, r3, lr ; A0 = a1<<2 + d1<<2 + addne r11, r11, #1 ; A0 += (a1!=0) + + lsls r2, r7, #16 + smuad r12, r7, lr ; C0 = a1<<2 + d1<<2 + addne r12, r12, #1 ; C0 += (a1!=0) + + add r0, r11, r12 ; a1_0 = A0 + C0 + sub r11, r11, r12 ; b1_0 = A0 - C0 + + lsls r2, r5, #16 + smuad r12, r5, lr ; B0 = a1<<2 + d1<<2 + addne r12, r12, #1 ; B0 += (a1!=0) + + lsls r2, r9, #16 + smuad r2, r9, lr ; D0 = a1<<2 + d1<<2 + addne r2, r2, #1 ; D0 += (a1!=0) + + add lr, r12, r2 ; d1_0 = B0 + D0 + sub r12, r12, r2 ; c1_0 = B0 - D0 + + ; op[0,4,8,12] + adds r2, r0, lr ; a2 = a1_0 + d1_0 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r0, r0, lr ; d2 = a1_0 - d1_0 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1] ; op[0] + + addmi r0, r0, #1 ; += a2 < 0 + add r0, r0, #3 ; += 3 + ldr lr, c00040004 + mov r0, r0, asr #3 ; >> 3 + strh r0, [r1, #24] ; op[12] + + adds r2, r11, r12 ; b2 = b1_0 + c1_0 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r0, r11, r12 ; c2 = b1_0 - c1_0 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #8] ; op[4] + + addmi r0, r0, #1 ; += a2 < 0 + add r0, r0, #3 ; += 3 + smusd r3, r3, lr ; A3 = a1<<2 - d1<<2 + smusd r7, r7, lr ; C3 = a1<<2 - d1<<2 + mov r0, r0, asr #3 ; >> 3 + strh r0, [r1, #16] ; op[8] + + + ; op[3,7,11,15] + add r0, r3, r7 ; a1_3 = A3 + C3 + sub r3, r3, r7 ; b1_3 = A3 - C3 + + smusd r5, r5, lr ; B3 = a1<<2 - d1<<2 + smusd r9, r9, lr ; D3 = a1<<2 - d1<<2 + add r7, r5, r9 ; d1_3 = B3 + D3 + sub r5, r5, r9 ; c1_3 = B3 - D3 + + adds r2, r0, r7 ; a2 = a1_3 + d1_3 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r3, r5 ; b2 = b1_3 + c1_3 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #6] ; op[3] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r3, r5 ; c2 = b1_3 - c1_3 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #14] ; op[7] + + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r0, r7 ; d2 = a1_3 - d1_3 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #22] ; op[11] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + smuad r3, r4, lr ; A1 = b1<<2 + c1<<2 + smuad r5, r8, lr ; C1 = b1<<2 + c1<<2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #30] ; op[15] + + ; op[1,5,9,13] + add r0, r3, r5 ; a1_1 = A1 + C1 + sub r3, r3, r5 ; b1_1 = A1 - C1 + + smuad r7, r6, lr ; B1 = b1<<2 + c1<<2 + smuad r9, r10, lr ; D1 = b1<<2 + c1<<2 + add r5, r7, r9 ; d1_1 = B1 + D1 + sub r7, r7, r9 ; c1_1 = B1 - D1 + + adds r2, r0, r5 ; a2 = a1_1 + d1_1 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r3, r7 ; b2 = b1_1 + c1_1 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #2] ; op[1] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r3, r7 ; c2 = b1_1 - c1_1 + mov r9, 
r9, asr #3 ; >> 3 + strh r9, [r1, #10] ; op[5] + + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r0, r5 ; d2 = a1_1 - d1_1 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #18] ; op[9] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + smusd r4, r4, lr ; A2 = b1<<2 - c1<<2 + smusd r8, r8, lr ; C2 = b1<<2 - c1<<2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #26] ; op[13] + + + ; op[2,6,10,14] + add r11, r4, r8 ; a1_2 = A2 + C2 + sub r12, r4, r8 ; b1_2 = A2 - C2 + + smusd r6, r6, lr ; B2 = b1<<2 - c1<<2 + smusd r10, r10, lr ; D2 = b1<<2 - c1<<2 + add r4, r6, r10 ; d1_2 = B2 + D2 + sub r8, r6, r10 ; c1_2 = B2 - D2 + + adds r2, r11, r4 ; a2 = a1_2 + d1_2 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r12, r8 ; b2 = b1_2 + c1_2 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #4] ; op[2] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r12, r8 ; c2 = b1_2 - c1_2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #12] ; op[6] + + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r11, r4 ; d2 = a1_2 - d1_2 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #20] ; op[10] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #28] ; op[14] + + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_short_walsh4x4_armv6| + +c00040004 + DCD 0x00040004 + + END diff --git a/vp9/encoder/arm/boolhuff_arm.c b/vp9/encoder/arm/boolhuff_arm.c new file mode 100644 index 000000000..49cc6d63b --- /dev/null +++ b/vp9/encoder/arm/boolhuff_arm.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp9/encoder/boolhuff.h" +#include "vp9/common/blockd.h" + +const unsigned int vp9_prob_cost[256] = { + 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, + 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778, + 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, + 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, + 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433, + 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, + 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307, + 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, + 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, + 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174, + 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139, + 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, + 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, + 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50, + 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 +}; + diff --git a/vp9/encoder/arm/dct_arm.c b/vp9/encoder/arm/dct_arm.c new file mode 100644 index 000000000..3fd04f383 --- /dev/null +++ b/vp9/encoder/arm/dct_arm.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "./vpx_rtcd.h" + +#if HAVE_ARMV6 + +void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) { + vp9_short_fdct4x4_armv6(input, output, pitch); + vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch); +} + +#endif /* HAVE_ARMV6 */ diff --git a/vp9/encoder/arm/dct_arm.h b/vp9/encoder/arm/dct_arm.h new file mode 100644 index 000000000..83c446e7e --- /dev/null +++ b/vp9/encoder/arm/dct_arm.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef DCT_ARM_H +#define DCT_ARM_H + +#if HAVE_ARMV6 +extern prototype_fdct(vp9_short_walsh4x4_armv6); +extern prototype_fdct(vp9_short_fdct4x4_armv6); +extern prototype_fdct(vp9_short_fdct8x4_armv6); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6 + +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6 + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6 + +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6 + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6 +#endif + +#endif /* HAVE_ARMV6 */ + +#if HAVE_ARMV7 +extern prototype_fdct(vp9_short_fdct4x4_neon); +extern prototype_fdct(vp9_short_fdct8x4_neon); +extern prototype_fdct(vp8_fast_fdct4x4_neon); +extern prototype_fdct(vp8_fast_fdct8x4_neon); +extern prototype_fdct(vp9_short_walsh4x4_neon); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon + +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon + +#undef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon +#endif + +#endif + +#endif diff --git a/vp9/encoder/arm/encodemb_arm.h b/vp9/encoder/arm/encodemb_arm.h new file mode 100644 index 000000000..80bff79df --- /dev/null +++ b/vp9/encoder/arm/encodemb_arm.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef ENCODEMB_ARM_H +#define ENCODEMB_ARM_H + +#if HAVE_ARMV6 +extern prototype_subb(vp9_subtract_b_armv6); +extern prototype_submby(vp9_subtract_mby_armv6); +extern prototype_submbuv(vp9_subtract_mbuv_armv6); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp9_subtract_b_armv6 + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp9_subtract_mby_armv6 + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6 +#endif + +#endif /* HAVE_ARMV6 */ + +#if HAVE_ARMV7 +// extern prototype_berr(vp9_block_error_c); +// extern prototype_mberr(vp9_mbblock_error_c); +// extern prototype_mbuverr(vp9_mbuverror_c); + +extern prototype_subb(vp9_subtract_b_neon); +extern prototype_submby(vp9_subtract_mby_neon); +extern prototype_submbuv(vp9_subtract_mbuv_neon); + +// #undef vp8_encodemb_berr +// #define vp8_encodemb_berr vp9_block_error_c + +// #undef vp8_encodemb_mberr +// #define vp8_encodemb_mberr vp9_mbblock_error_c + +// #undef vp8_encodemb_mbuverr +// #define vp8_encodemb_mbuverr vp9_mbuverror_c + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp9_subtract_b_neon + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp9_subtract_mby_neon + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon +#endif + +#endif + +#endif diff --git a/vp9/encoder/arm/neon/fastquantizeb_neon.asm b/vp9/encoder/arm/neon/fastquantizeb_neon.asm new file mode 100644 index 000000000..259707658 --- /dev/null +++ b/vp9/encoder/arm/neon/fastquantizeb_neon.asm @@ -0,0 +1,261 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_fast_quantize_b_neon| + EXPORT |vp8_fast_quantize_b_pair_neon| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=4 + +;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); +|vp8_fast_quantize_b_pair_neon| PROC + + stmfd sp!, {r4-r9} + vstmdb sp!, {q4-q7} + + ldr r4, [r0, #vp8_block_coeff] + ldr r5, [r0, #vp8_block_quant_fast] + ldr r6, [r0, #vp8_block_round] + + vld1.16 {q0, q1}, [r4@128] ; load z + + ldr r7, [r2, #vp8_blockd_qcoeff] + + vabs.s16 q4, q0 ; calculate x = abs(z) + vabs.s16 q5, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vshr.s16 q3, q1, #15 + + vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15] + + ldr r4, [r1, #vp8_block_coeff] + + vadd.s16 q4, q6 ; x + Round + vadd.s16 q5, q7 + + vld1.16 {q0, q1}, [r4@128] ; load z2 + + vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q5, q9 + + vabs.s16 q10, q0 ; calculate x2 = abs(z_2) + vabs.s16 q11, q1 + vshr.s16 q12, q0, #15 ; sz2 + vshr.s16 q13, q1, #15 + + ;modify data to have its original sign + veor.s16 q4, q2 ; y^sz + veor.s16 q5, q3 + + vadd.s16 q10, q6 ; x2 + Round + vadd.s16 q11, q7 + + ldr r8, [r2, #vp8_blockd_dequant] + + vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q11, q9 + + vshr.s16 q4, #1 ; right shift 1 after vqdmulh + vshr.s16 q5, #1 + + vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i] + + vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q5, q3 + + vshr.s16 q10, #1 ; right shift 1 after vqdmulh + vshr.s16 q11, #1 + + ldr r9, [r2, #vp8_blockd_dqcoeff] + + veor.s16 q10, q12 ; y2^sz2 + veor.s16 q11, q13 + + vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1 + + + vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q11, q13 + + ldr r6, [r3, #vp8_blockd_qcoeff] + + vmul.s16 q2, q6, q4 ; x * Dequant + vmul.s16 q3, q7, q5 + + ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table + + vceq.s16 q8, q8 ; set q8 to all 1 + + vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2 + + vmul.s16 q12, q6, q10 ; x2 * Dequant + vmul.s16 q13, q7, q11 + + vld1.16 {q6, q7}, [r0@128] ; load inverse scan order + + vtst.16 q14, q4, q8 ; now find eob + vtst.16 q15, q5, q8 ; non-zero element is set to all 1 + + vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant + + ldr r7, [r3, #vp8_blockd_dqcoeff] + + vand q0, q6, q14 ; get all valid numbers from scan array + vand q1, q7, q15 + + vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant + + vtst.16 q2, q10, q8 ; now find eob + vtst.16 q3, q11, q8 ; non-zero element is set to all 1 + + vmax.u16 q0, q0, q1 ; find maximum value in q0, q1 + + vand q10, q6, q2 ; get all valid numbers from scan array + vand q11, q7, q3 + vmax.u16 q10, q10, q11 ; find maximum value in q10, q11 + + vmax.u16 d0, d0, d1 + vmax.u16 d20, d20, d21 + vmovl.u16 q0, d0 + vmovl.u16 q10, d20 + + + vmax.u32 d0, d0, d1 + vmax.u32 d20, d20, d21 + vpmax.u32 d0, d0, d0 + vpmax.u32 d20, d20, d20 + + add r4, r2, #vp8_blockd_eob + add r5, r3, #vp8_blockd_eob + + vst1.32 {d0[0]}, [r4@32] + vst1.32 {d20[0]}, [r5@32] + + vldmia sp!, {q4-q7} + ldmfd sp!, {r4-r9} + bx lr + + ENDP + +;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) +|vp8_fast_quantize_b_neon| PROC + + stmfd sp!, {r4-r7} + + ldr r3, [r0, #vp8_block_coeff] + ldr r4, [r0, #vp8_block_quant_fast] + ldr r5, [r0, #vp8_block_round] + + vld1.16 {q0, q1}, [r3@128] ; load z + 
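+ ; fast quantize: x = abs(z), y = ((x + round) * quant_fast) >> 16,
+ ; qcoeff = (y ^ sz) - sz, dqcoeff = qcoeff * dequant[];
+ ; eob is the highest inverse-zigzag position holding a nonzero qcoeff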
vorr.s16 q14, q0, q1 ; check if all zero (step 1) + ldr r6, [r1, #vp8_blockd_qcoeff] + ldr r7, [r1, #vp8_blockd_dqcoeff] + vorr.s16 d28, d28, d29 ; check if all zero (step 2) + + vabs.s16 q12, q0 ; calculate x = abs(z) + vabs.s16 q13, q1 + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vmov r2, r3, d28 ; check if all zero (step 3) + vshr.s16 q3, q1, #15 + + vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15] + vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15] + + vadd.s16 q12, q14 ; x + Round + vadd.s16 q13, q15 + + ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table + + vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16 + vqdmulh.s16 q13, q9 + + vld1.16 {q10, q11}, [r0@128]; load inverse scan order + + vceq.s16 q8, q8 ; set q8 to all 1 + + ldr r4, [r1, #vp8_blockd_dequant] + + vshr.s16 q12, #1 ; right shift 1 after vqdmulh + vshr.s16 q13, #1 + + orr r2, r2, r3 ; check if all zero (step 4) + cmp r2, #0 ; check if all zero (step 5) + beq zero_output ; check if all zero (step 6) + + ;modify data to have its original sign + veor.s16 q12, q2 ; y^sz + veor.s16 q13, q3 + + vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) + vsub.s16 q13, q3 + + vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i] + + vtst.16 q14, q12, q8 ; now find eob + vtst.16 q15, q13, q8 ; non-zero element is set to all 1 + + vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1 + + vand q10, q10, q14 ; get all valid numbers from scan array + vand q11, q11, q15 + + + vmax.u16 q0, q10, q11 ; find maximum value in q0, q1 + vmax.u16 d0, d0, d1 + vmovl.u16 q0, d0 + + vmul.s16 q2, q12 ; x * Dequant + vmul.s16 q3, q13 + + vmax.u32 d0, d0, d1 + vpmax.u32 d0, d0, d0 + + vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant + + add r4, r1, #vp8_blockd_eob + vst1.32 {d0[0]}, [r4@32] + + ldmfd sp!, {r4-r7} + bx lr + +zero_output + str r2, [r1, #vp8_blockd_eob] + vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0 + vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0 + + ldmfd sp!, {r4-r7} + bx lr + + ENDP + +; default inverse zigzag table is defined in vp9/common/entropy.c +_inv_zig_zag_ + DCD inv_zig_zag + + ALIGN 16 ; enable use of @128 bit aligned loads +inv_zig_zag + DCW 0x0001, 0x0002, 0x0006, 0x0007 + DCW 0x0003, 0x0005, 0x0008, 0x000d + DCW 0x0004, 0x0009, 0x000c, 0x000e + DCW 0x000a, 0x000b, 0x000f, 0x0010 + + END + diff --git a/vp9/encoder/arm/neon/picklpf_arm.c b/vp9/encoder/arm/neon/picklpf_arm.c new file mode 100644 index 000000000..f16dadc62 --- /dev/null +++ b/vp9/encoder/arm/neon/picklpf_arm.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp9/common/onyxc_int.h" +#include "vp9/encoder/onyx_int.h" +#include "vp9/encoder/quantize.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12extend.h" +#include "vpx_scale/vpxscale.h" +#include "vp9/common/alloccommon.h" + +extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); + + +void +vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { + unsigned char *src_y, *dst_y; + int yheight; + int ystride; + int border; + int yoffset; + int linestocopy; + + border = src_ybc->border; + yheight = src_ybc->y_height; + ystride = src_ybc->y_stride; + + linestocopy = (yheight >> (Fraction + 4)); + + if (linestocopy < 1) + linestocopy = 1; + + linestocopy <<= 4; + + yoffset = ystride * ((yheight >> 5) * 16 - 8); + src_y = src_ybc->y_buffer + yoffset; + dst_y = dst_ybc->y_buffer + yoffset; + + // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16)); + vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16))); +} diff --git a/vp9/encoder/arm/neon/sad16_neon.asm b/vp9/encoder/arm/neon/sad16_neon.asm new file mode 100644 index 000000000..d7c590e15 --- /dev/null +++ b/vp9/encoder/arm/neon/sad16_neon.asm @@ -0,0 +1,207 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sad16x16_neon| + EXPORT |vp8_sad16x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int src_stride +; r2 unsigned char *ref_ptr +; r3 int ref_stride +|vp8_sad16x16_neon| PROC +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0] + vld1.8 {q7}, [r2] + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + 
vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================== +;unsigned int vp8_sad16x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +|vp8_sad16x8_neon| PROC + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp9/encoder/arm/neon/sad8_neon.asm b/vp9/encoder/arm/neon/sad8_neon.asm new file mode 100644 index 000000000..23ba6df93 --- /dev/null +++ b/vp9/encoder/arm/neon/sad8_neon.asm @@ -0,0 +1,209 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_sad8x8_neon| + EXPORT |vp8_sad8x16_neon| + EXPORT |vp8_sad4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; unsigned int vp8_sad8x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x8_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================ +;unsigned int vp8_sad8x16_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x16_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;=========================== +;unsigned int vp8_sad4x4_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad4x4_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 d1, d24 + vpaddl.u32 d0, d1 + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp9/encoder/arm/neon/shortfdct_neon.asm b/vp9/encoder/arm/neon/shortfdct_neon.asm new file mode 100644 index 000000000..09dd011ec --- /dev/null +++ b/vp9/encoder/arm/neon/shortfdct_neon.asm @@ -0,0 +1,221 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_short_fdct4x4_neon| + EXPORT |vp8_short_fdct8x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=4 + + + ALIGN 16 ; enable use of @128 bit aligned loads +coeff + DCW 5352, 5352, 5352, 5352 + DCW 2217, 2217, 2217, 2217 + DCD 14500, 14500, 14500, 14500 + DCD 7500, 7500, 7500, 7500 + DCD 12000, 12000, 12000, 12000 + DCD 51000, 51000, 51000, 51000 + +;void vp8_short_fdct4x4_c(short *input, short *output, int pitch) +|vp8_short_fdct4x4_neon| PROC + + ; Part one + vld1.16 {d0}, [r0@64], r2 + adr r12, coeff + vld1.16 {d1}, [r0@64], r2 + vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 + vld1.16 {d2}, [r0@64], r2 + vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 + vld1.16 {d3}, [r0@64], r2 + + ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000 + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3] + vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2] + vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2] + vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3] + + vshl.s16 q2, q2, #3 ; (a1, b1) << 3 + vshl.s16 q3, q3, #3 ; (c1, d1) << 3 + + vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + vsub.s16 d2, d4, d5 ; op[2] = a1 - b1 + + vmlal.s16 q9, d7, d16 ; d1*5352 + 14500 + vmlal.s16 q10, d7, d17 ; d1*2217 + 7500 + vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500 + vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500 + + vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12 + vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12 + + + ; Part two + + ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vmov.s16 d26, #7 + + vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12] + vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8] + vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8] + vadd.s16 d4, d4, d26 ; a1 + 7 + vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12] + + vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7 + vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7 + + vmlal.s16 q11, d7, d16 ; d1*5352 + 12000 + vmlal.s16 q12, d7, d17 ; d1*2217 + 51000 + + vceq.s16 d4, d7, #0 + + vshr.s16 d0, d0, #4 + vshr.s16 d2, d2, #4 + + vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000 + vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000 + + vmvn.s16 d4, d4 + vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16 + vsub.s16 d1, d1, d4 ; op[4] += (d1!=0) + vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16 + + vst1.16 {q0, q1}, [r1@128] + + bx lr + + ENDP + +;void vp8_short_fdct8x4_c(short *input, short *output, int pitch) +|vp8_short_fdct8x4_neon| PROC + + ; Part one + + vld1.16 {q0}, [r0@128], r2 + adr r12, coeff + vld1.16 {q1}, [r0@128], r2 + vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 + vld1.16 {q2}, [r0@128], r2 + vld1.32 {q9, q10}, [r12@128]! 
; q9=14500, q10=7500 + vld1.16 {q3}, [r0@128], r2 + + ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3] + vtrn.32 q0, q2 ; [A0|B0] + vtrn.32 q1, q3 ; [A1|B1] + vtrn.16 q0, q1 ; [A2|B2] + vtrn.16 q2, q3 ; [A3|B3] + + vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3] + vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2] + vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2] + vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3] + + vshl.s16 q11, q11, #3 ; a1 << 3 + vshl.s16 q12, q12, #3 ; b1 << 3 + vshl.s16 q13, q13, #3 ; c1 << 3 + vshl.s16 q14, q14, #3 ; d1 << 3 + + vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1 + vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1 + + vmov.s16 q11, q9 ; 14500 + vmov.s16 q12, q10 ; 7500 + + vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500 + vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500 + vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500 + vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500 + + vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500 + vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500 + vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500 + vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500 + + vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12 + vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12 + vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12 + vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12 + + + ; Part two + vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000 + + ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12] + vtrn.32 q0, q2 ; q0=[A0 | B0] + vtrn.32 q1, q3 ; q1=[A4 | B4] + vtrn.16 q0, q1 ; q2=[A8 | B8] + vtrn.16 q2, q3 ; q3=[A12|B12] + + vmov.s16 q15, #7 + + vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12] + vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8] + vadd.s16 q11, q11, q15 ; a1 + 7 + vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8] + vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12] + + vadd.s16 q0, q11, q12 ; a1 + b1 + 7 + vsub.s16 q1, q11, q12 ; a1 - b1 + 7 + + vmov.s16 q11, q9 ; 12000 + vmov.s16 q12, q10 ; 51000 + + vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4 + vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4 + vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4 + vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4 + + + vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000 + vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000 + vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000 + vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000 + + vceq.s16 q14, q14, #0 + + vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000 + vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000 + vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000 + vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000 + + vmvn.s16 q14, q14 + + vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16 + vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16 + vsub.s16 d1, d1, d28 ; A[4] += (d1!=0) + + vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16 + vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16 + vsub.s16 d5, d5, d29 ; B[4] += (d1!=0) + + vst1.16 {q0, q1}, [r1@128]! ; block A + vst1.16 {q2, q3}, [r1@128]! ; block B + + bx lr + + ENDP + + END + diff --git a/vp9/encoder/arm/neon/subtract_neon.asm b/vp9/encoder/arm/neon/subtract_neon.asm new file mode 100644 index 000000000..68c295062 --- /dev/null +++ b/vp9/encoder/arm/neon/subtract_neon.asm @@ -0,0 +1,185 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp8_subtract_b_neon| + EXPORT |vp8_subtract_mby_neon| + EXPORT |vp8_subtract_mbuv_neon| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) +|vp8_subtract_b_neon| PROC + + stmfd sp!, {r4-r7} + + ldr r3, [r0, #vp8_block_base_src] + ldr r4, [r0, #vp8_block_src] + ldr r5, [r0, #vp8_block_src_diff] + ldr r3, [r3] + ldr r6, [r0, #vp8_block_src_stride] + add r3, r3, r4 ; src = *base_src + src + ldr r7, [r1, #vp8_blockd_predictor] + + vld1.8 {d0}, [r3], r6 ;load src + vld1.8 {d1}, [r7], r2 ;load pred + vld1.8 {d2}, [r3], r6 + vld1.8 {d3}, [r7], r2 + vld1.8 {d4}, [r3], r6 + vld1.8 {d5}, [r7], r2 + vld1.8 {d6}, [r3], r6 + vld1.8 {d7}, [r7], r2 + + vsubl.u8 q10, d0, d1 + vsubl.u8 q11, d2, d3 + vsubl.u8 q12, d4, d5 + vsubl.u8 q13, d6, d7 + + mov r2, r2, lsl #1 + + vst1.16 {d20}, [r5], r2 ;store diff + vst1.16 {d22}, [r5], r2 + vst1.16 {d24}, [r5], r2 + vst1.16 {d26}, [r5], r2 + + ldmfd sp!, {r4-r7} + bx lr + + ENDP + + +;========================================== +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +|vp8_subtract_mby_neon| PROC + mov r12, #4 + +subtract_mby_loop + vld1.8 {q0}, [r1], r3 ;load src + vld1.8 {q1}, [r2]! ;load pred + vld1.8 {q2}, [r1], r3 + vld1.8 {q3}, [r2]! + vld1.8 {q4}, [r1], r3 + vld1.8 {q5}, [r2]! + vld1.8 {q6}, [r1], r3 + vld1.8 {q7}, [r2]! + + vsubl.u8 q8, d0, d2 + vsubl.u8 q9, d1, d3 + vsubl.u8 q10, d4, d6 + vsubl.u8 q11, d5, d7 + vsubl.u8 q12, d8, d10 + vsubl.u8 q13, d9, d11 + vsubl.u8 q14, d12, d14 + vsubl.u8 q15, d13, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + subs r12, r12, #1 + bne subtract_mby_loop + + bx lr + ENDP + +;================================= +;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +|vp8_subtract_mbuv_neon| PROC + ldr r12, [sp] + +;u + add r0, r0, #512 ; short *udiff = diff + 256; + add r3, r3, #256 ; unsigned char *upred = pred + 256; + + vld1.8 {d0}, [r1], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r1], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r1], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r1], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r1], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r1], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r1], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r1], r12 + vld1.8 {d15}, [r3]! + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + +;v + vld1.8 {d0}, [r2], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r2], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r2], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r2], r12 + vld1.8 {d7}, [r3]! 
+ vld1.8 {d8}, [r2], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r2], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r2], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r2], r12 + vld1.8 {d15}, [r3]! + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + bx lr + ENDP + + END diff --git a/vp9/encoder/arm/neon/variance_neon.asm b/vp9/encoder/arm/neon/variance_neon.asm new file mode 100644 index 000000000..901934c61 --- /dev/null +++ b/vp9/encoder/arm/neon/variance_neon.asm @@ -0,0 +1,276 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_variance16x16_neon| + EXPORT |vp9_variance16x8_neon| + EXPORT |vp9_variance8x16_neon| + EXPORT |vp9_variance8x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance16x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + ;VPADAL adds adjacent pairs of elements of a vector, and accumulates + ;the results into the elements of the destination vector. The explanation + ;in ARM guide is wrong. + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + ;vmov.32 r0, d0[0] ;this instruction costs a lot + ;vmov.32 r1, d1[0] + ;mul r0, r0, r0 + ;str r1, [r12] + ;sub r0, r1, r0, asr #8 + + ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should + ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right. 
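+    ;(editorial note, hedged) the lines below are understood to finish the
+    ;standard variance identity for this 16x16 (256 pixel) block:
+    ;    *sse     = sum((src - ref)^2)
+    ;    variance = *sse - ((sum * sum) >> 8)        ; >>8 is /256
+    ;the 16x8 and 8x16 variants later in this file shift by 7 (128 pixels)
+    ;and the 8x8 variant by 6 (64 pixels).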
+ vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================ +;unsigned int vp9_variance16x8_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) +|vp9_variance16x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #4 + +variance16x8_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #7 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================= +;unsigned int vp9_variance8x16_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) + +|vp9_variance8x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance8x16_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d2, d6 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + + bne variance8x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #7 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================== +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp9_variance8x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +variance8x8_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne 
variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #6 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + + END diff --git a/vp9/encoder/arm/neon/vp8_memcpy_neon.asm b/vp9/encoder/arm/neon/vp8_memcpy_neon.asm new file mode 100644 index 000000000..b0450e523 --- /dev/null +++ b/vp9/encoder/arm/neon/vp8_memcpy_neon.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_memcpy_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;========================================= +;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); +|vp8_memcpy_neon| PROC + ;pld [r1] ;preload pred data + ;pld [r1, #128] + ;pld [r1, #256] + ;pld [r1, #384] + + mov r12, r2, lsr #8 ;copy 256 bytes data at one time + +memcpy_neon_loop + vld1.8 {q0, q1}, [r1]! ;load src data + subs r12, r12, #1 + vld1.8 {q2, q3}, [r1]! + vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr + vld1.8 {q4, q5}, [r1]! + vst1.8 {q2, q3}, [r0]! + vld1.8 {q6, q7}, [r1]! + vst1.8 {q4, q5}, [r0]! + vld1.8 {q8, q9}, [r1]! + vst1.8 {q6, q7}, [r0]! + vld1.8 {q10, q11}, [r1]! + vst1.8 {q8, q9}, [r0]! + vld1.8 {q12, q13}, [r1]! + vst1.8 {q10, q11}, [r0]! + vld1.8 {q14, q15}, [r1]! + vst1.8 {q12, q13}, [r0]! + vst1.8 {q14, q15}, [r0]! + + ;pld [r1] ;preload pred data -- need to adjust for real device + ;pld [r1, #128] + ;pld [r1, #256] + ;pld [r1, #384] + + bne memcpy_neon_loop + + ands r3, r2, #0xff ;extra copy + beq done_copy_neon_loop + +extra_copy_neon_loop + vld1.8 {q0}, [r1]! ;load src data + subs r3, r3, #16 + vst1.8 {q0}, [r0]! + bne extra_copy_neon_loop + +done_copy_neon_loop + bx lr + ENDP + + END diff --git a/vp9/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp9/encoder/arm/neon/vp8_mse16x16_neon.asm new file mode 100644 index 000000000..4d1512d40 --- /dev/null +++ b/vp9/encoder/arm/neon/vp8_mse16x16_neon.asm @@ -0,0 +1,116 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_mse16x16_neon| + EXPORT |vp8_get4x4sse_cs_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;============================ +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +;note: in this function, sum is never used. So, we can remove this part of calculation +;from vp9_variance(). 
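+;(editorial note, hedged) scalar sketch of what this routine is understood
+;to return; the function name and local variables below are illustrative only:
+;
+;  unsigned int mse16x16(const unsigned char *src, int src_stride,
+;                        const unsigned char *ref, int ref_stride,
+;                        unsigned int *sse) {
+;    unsigned int total = 0;
+;    int r, c;
+;    for (r = 0; r < 16; r++)
+;      for (c = 0; c < 16; c++) {
+;        int d = src[r * src_stride + c] - ref[r * ref_stride + c];
+;        total += (unsigned int)(d * d);
+;      }
+;    *sse = total;
+;    return total;   /* no (sum*sum)/N correction, unlike vp9_variance() */
+;  }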
+ +|vp8_mse16x16_neon| PROC + vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse + vmov.i8 q8, #0 + vmov.i8 q9, #0 + vmov.i8 q10, #0 + + mov r12, #8 + +mse16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vmlal.s16 q7, d22, d22 + vmlal.s16 q8, d23, d23 + + subs r12, r12, #1 + + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vmlal.s16 q7, d26, d26 + vmlal.s16 q8, d27, d27 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne mse16x16_neon_loop + + vadd.u32 q7, q7, q8 + vadd.u32 q9, q9, q10 + + ldr r12, [sp] ;load *sse from stack + + vadd.u32 q10, q7, q9 + vpaddl.u32 q1, q10 + vadd.u64 d0, d2, d3 + + vst1.32 {d0[0]}, [r12] + vmov.32 r0, d0[0] + + bx lr + + ENDP + + +;============================= +; r0 unsigned char *src_ptr, +; r1 int source_stride, +; r2 unsigned char *ref_ptr, +; r3 int recon_stride +|vp8_get4x4sse_cs_neon| PROC + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vmull.s16 q7, d22, d22 + vmull.s16 q8, d24, d24 + vmull.s16 q9, d26, d26 + vmull.s16 q10, d28, d28 + + vadd.u32 q7, q7, q8 + vadd.u32 q9, q9, q10 + vadd.u32 q9, q7, q9 + + vpaddl.u32 q1, q9 + vadd.u64 d0, d2, d3 + + vmov.32 r0, d0[0] + bx lr + + ENDP + + END diff --git a/vp9/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp9/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm new file mode 100644 index 000000000..22266297a --- /dev/null +++ b/vp9/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm @@ -0,0 +1,103 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_short_walsh4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch) +; r0 short *input, +; r1 short *output, +; r2 int pitch +|vp8_short_walsh4x4_neon| PROC + + vld1.16 {d0}, [r0@64], r2 ; load input + vld1.16 {d1}, [r0@64], r2 + vld1.16 {d2}, [r0@64], r2 + vld1.16 {d3}, [r0@64] + + ;First for-loop + ;transpose d0, d1, d2, d3. 
Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] + vtrn.32 d0, d2 + vtrn.32 d1, d3 + + vmov.s32 q15, #3 ; add 3 to all values + + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vadd.s16 d4, d0, d2 ; ip[0] + ip[2] + vadd.s16 d5, d1, d3 ; ip[1] + ip[3] + vsub.s16 d6, d1, d3 ; ip[1] - ip[3] + vsub.s16 d7, d0, d2 ; ip[0] - ip[2] + + vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2 + vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2 + vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2 + vceq.s16 d16, d4, #0 ; a1 == 0 + vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2 + + vadd.s16 d0, d4, d5 ; a1 + d1 + vmvn d16, d16 ; a1 != 0 + vsub.s16 d3, d4, d5 ; op[3] = a1 - d1 + vadd.s16 d1, d7, d6 ; op[1] = b1 + c1 + vsub.s16 d2, d7, d6 ; op[2] = b1 - c1 + vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0) + + ;Second for-loop + ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] + vtrn.32 d1, d3 + vtrn.32 d0, d2 + vtrn.16 d2, d3 + vtrn.16 d0, d1 + + vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8] + vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12] + vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12] + vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8] + + vadd.s32 q0, q8, q9 ; a2 = a1 + d1 + vadd.s32 q1, q11, q10 ; b2 = b1 + c1 + vsub.s32 q2, q11, q10 ; c2 = b1 - c1 + vsub.s32 q3, q8, q9 ; d2 = a1 - d1 + + vclt.s32 q8, q0, #0 + vclt.s32 q9, q1, #0 + vclt.s32 q10, q2, #0 + vclt.s32 q11, q3, #0 + + ; subtract -1 (or 0) + vsub.s32 q0, q0, q8 ; a2 += a2 < 0 + vsub.s32 q1, q1, q9 ; b2 += b2 < 0 + vsub.s32 q2, q2, q10 ; c2 += c2 < 0 + vsub.s32 q3, q3, q11 ; d2 += d2 < 0 + + vadd.s32 q8, q0, q15 ; a2 + 3 + vadd.s32 q9, q1, q15 ; b2 + 3 + vadd.s32 q10, q2, q15 ; c2 + 3 + vadd.s32 q11, q3, q15 ; d2 + 3 + + ; vrshrn? would add 1 << 3-1 = 2 + vshrn.s32 d0, q8, #3 + vshrn.s32 d1, q9, #3 + vshrn.s32 d2, q10, #3 + vshrn.s32 d3, q11, #3 + + vst1.16 {q0, q1}, [r1@128] + + bx lr + + ENDP + + END diff --git a/vp9/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm new file mode 100644 index 000000000..8bb0734d1 --- /dev/null +++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -0,0 +1,425 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_sub_pixel_variance16x16_neon_func| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon. 
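+;(editorial note, hedged) in scalar terms the two filter passes below are
+;understood to perform the usual two-tap bilinear interpolation before the
+;variance against dst_ptr is accumulated; HFilter/VFilter stand for rows of
+;the bilinear_taps_coeff table at the end of this file:
+;
+;  /* illustrative sketch, not the project's C code */
+;  temp[r][c] = (src[r][c]      * HFilter[0] +
+;                src[r][c + 1]  * HFilter[1] + 64) >> 7;   /* xoffset pass */
+;  pred[r][c] = (temp[r][c]     * VFilter[0] +
+;                temp[r + 1][c] * VFilter[1] + 64) >> 7;   /* yoffset pass */
+;
+;either pass is bypassed entirely when its offset is zero, as the branches
+;below show.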
+ +|vp9_sub_pixel_variance16x16_neon_func| PROC + push {r4-r6, lr} + + ldr r12, _BilinearTaps_coeff_ + ldr r4, [sp, #16] ;load *dst_ptr from stack + ldr r5, [sp, #20] ;load dst_pixels_per_line from stack + ldr r6, [sp, #24] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! + vld1.u8 {d11, d12, d13}, [r0], r1 + + bne vp8e_filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! 
;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! + +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + sub sp, sp, #256 + mov r3, sp + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +vp8e_filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + vst1.u8 {d4, d5}, [r3]! + vst1.u8 {d6, d7}, [r3]! + vmov q11, q15 + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_sp16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + sub sp, sp, #528 ;reserve space on stack for temporary storage + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r3]! ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r3]! + vst1.u8 {d18, d19}, [r3]! + vst1.u8 {d20, d21}, [r3]! 
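+    ; 4 iterations of 4 rows each: 16 horizontally filtered rows end up in
+    ; the temporary stack buffer before the variance pass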
+ + bne vp8e_filt_blk2d_fpo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + sub sp, sp, #528 ;reserve space on stack for temporary storage + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + mov r3, sp + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +vp8e_filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r3]! + vmov q11, q15 + vst1.u8 {d6, d7}, [r3]! + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_spo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r12, #8 + +sub_pixel_variance16x16_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q2}, [r4], r5 + vld1.8 {q1}, [r3]! + vld1.8 {q3}, [r4], r5 + + vsubl.u8 q11, d0, d4 ;diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne sub_pixel_variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r6] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + add sp, sp, #528 + vmov.32 r0, d0[0] ;return + + pop {r4-r6,pc} + + ENDP + +;----------------- + +_BilinearTaps_coeff_ + DCD bilinear_taps_coeff +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp9/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm new file mode 100644 index 000000000..a3faf9a77 --- /dev/null +++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm @@ -0,0 +1,572 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_variance_halfpixvar16x16_h_neon| + EXPORT |vp9_variance_halfpixvar16x16_v_neon| + EXPORT |vp9_variance_halfpixvar16x16_hv_neon| + EXPORT |vp9_sub_pixel_variance16x16s_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;================================================ +;unsigned int vp9_variance_halfpixvar16x16_h_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp9_variance_halfpixvar16x16_h_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +;First Pass: output_height lines x output_width columns (16x16) +vp8_filt_fpo16x16s_4_0_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.8 {q11}, [r2], r3 + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.8 {q12}, [r2], r3 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.8 {q13}, [r2], r3 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vext.8 q3, q2, q3, #1 + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vld1.8 {q14}, [r2], r3 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + + vsubl.u8 q4, d0, d22 ;diff + vsubl.u8 q5, d1, d23 + vsubl.u8 q6, d2, d24 + vsubl.u8 q7, d3, d25 + vsubl.u8 q0, d4, d26 + vsubl.u8 q1, d5, d27 + vsubl.u8 q2, d6, d28 + vsubl.u8 q3, d7, d29 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + subs r12, r12, #1 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_fpo16x16s_4_0_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp9_variance_halfpixvar16x16_v_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp9_variance_halfpixvar16x16_v_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + + vld1.u8 {q0}, [r0], r1 ;load src data + ldr lr, [sp, #4] ;load *sse from stack + + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +vp8_filt_spo16x16s_0_4_loop_neon + vld1.u8 {q2}, [r0], r1 + vld1.8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vld1.8 {q3}, [r2], r3 + vld1.u8 {q6}, [r0], r1 + vld1.8 {q5}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + + vrhadd.u8 q0, q0, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q4 + vrhadd.u8 q4, q4, q6 + vrhadd.u8 q6, q6, q15 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + 
vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + + vmov q0, q15 + + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_spo16x16s_0_4_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp9_variance_halfpixvar16x16_hv_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp9_variance_halfpixvar16x16_hv_neon| PROC + push {lr} + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q13, #0 ;q8 - sum + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + + vmov.i8 q14, #0 ;q9, q10 - sse + vmov.i8 q15, #0 + + mov r12, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8_filt16x16s_4_4_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vld1.8 {q5}, [r2], r3 + vrhadd.u8 q0, q0, q1 + vld1.8 {q6}, [r2], r3 + vrhadd.u8 q1, q1, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q3 + vld1.8 {q8}, [r2], r3 + vrhadd.u8 q3, q3, q4 + + vsubl.u8 q9, d0, d10 ;diff + vsubl.u8 q10, d1, d11 + vsubl.u8 q11, d2, d12 + vsubl.u8 q12, d3, d13 + + vsubl.u8 q0, d4, d14 ;diff + vsubl.u8 q1, d5, d15 + vsubl.u8 q5, d6, d16 + vsubl.u8 q6, d7, d17 + + vpadal.s16 q13, q9 ;sum + vmlal.s16 q14, d18, d18 ;sse + vmlal.s16 q15, d19, d19 + + vpadal.s16 q13, q10 ;sum + vmlal.s16 q14, d20, d20 ;sse + vmlal.s16 q15, d21, d21 + + vpadal.s16 q13, q11 ;sum + vmlal.s16 q14, d22, d22 ;sse + vmlal.s16 q15, d23, d23 + + vpadal.s16 q13, q12 ;sum + vmlal.s16 q14, d24, d24 ;sse + vmlal.s16 q15, d25, d25 + + subs r12, r12, #1 + + vpadal.s16 q13, q0 ;sum + vmlal.s16 q14, d0, d0 ;sse + vmlal.s16 q15, d1, d1 + + vpadal.s16 q13, q1 ;sum + vmlal.s16 q14, d2, d2 ;sse + vmlal.s16 q15, d3, d3 + + vpadal.s16 q13, q5 ;sum + vmlal.s16 q14, d10, d10 ;sse + vmlal.s16 q15, d11, d11 + + vmov q0, q4 + + vpadal.s16 q13, q6 ;sum + vmlal.s16 q14, d12, d12 ;sse + vmlal.s16 q15, d13, d13 + + bne vp8_filt16x16s_4_4_loop_neon + + vadd.u32 q15, q14, q15 ;accumulate sse + vpaddl.s32 q0, q13 ;accumulate 
sum + + vpaddl.u32 q1, q15 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;============================== +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pixels_per_line, +; stack unsigned int *sse +;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step() +;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter, +;or filter coeff is {64, 64}. This simplified program only works in this situation. +;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later. + +|vp9_sub_pixel_variance16x16s_neon| PROC + push {r4, lr} + + ldr r4, [sp, #8] ;load *dst_ptr from stack + ldr r12, [sp, #12] ;load dst_pixels_per_line from stack + ldr lr, [sp, #16] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16s_only + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq firstpass_bfilter16x16s_only + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + sub sp, sp, #256 ;reserve space on stack for temporary storage + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + mov r3, sp + mov r2, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16s_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vrhadd.u8 q0, q0, q1 + vrhadd.u8 q1, q1, q2 + vrhadd.u8 q2, q2, q3 + vrhadd.u8 q3, q3, q4 + + subs r2, r2, #1 + vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result + vmov q0, q4 + vst1.u8 {d4, d5, d6, d7}, [r3]! + + bne vp8e_filt_blk2d_fp16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;-------------------- +firstpass_bfilter16x16s_only + mov r2, #2 ;loop counter + sub sp, sp, #256 ;reserve space on stack for temporary storage + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16s_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + vext.8 q3, q2, q3, #1 + vld1.u8 {d20, d21, d22, d23}, [r0], r1 + vext.8 q5, q4, q5, #1 + vld1.u8 {d24, d25, d26, d27}, [r0], r1 + vext.8 q7, q6, q7, #1 + vld1.u8 {d28, d29, d30, d31}, [r0], r1 + vext.8 q9, q8, q9, #1 + vext.8 q11, q10, q11, #1 + vext.8 q13, q12, q13, #1 + vext.8 q15, q14, q15, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + vrhadd.u8 q5, q10, q11 + vrhadd.u8 q6, q12, q13 + vrhadd.u8 q7, q14, q15 + + subs r2, r2, #1 + + vst1.u8 {d0, d1, d2, d3}, [r3]! 
;store result + vst1.u8 {d4, d5, d6, d7}, [r3]! + vst1.u8 {d8, d9, d10, d11}, [r3]! + vst1.u8 {d12, d13, d14, d15}, [r3]! + + bne vp8e_filt_blk2d_fpo16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;--------------------- +secondpass_bfilter16x16s_only + sub sp, sp, #256 ;reserve space on stack for temporary storage + + mov r2, #2 ;loop counter + vld1.u8 {d0, d1}, [r0], r1 ;load src data + mov r3, sp + +vp8e_filt_blk2d_spo16x16s_loop_neon + vld1.u8 {d2, d3}, [r0], r1 + vld1.u8 {d4, d5}, [r0], r1 + vld1.u8 {d6, d7}, [r0], r1 + vld1.u8 {d8, d9}, [r0], r1 + + vrhadd.u8 q0, q0, q1 + vld1.u8 {d10, d11}, [r0], r1 + vrhadd.u8 q1, q1, q2 + vld1.u8 {d12, d13}, [r0], r1 + vrhadd.u8 q2, q2, q3 + vld1.u8 {d14, d15}, [r0], r1 + vrhadd.u8 q3, q3, q4 + vld1.u8 {d16, d17}, [r0], r1 + vrhadd.u8 q4, q4, q5 + vrhadd.u8 q5, q5, q6 + vrhadd.u8 q6, q6, q7 + vrhadd.u8 q7, q7, q8 + + subs r2, r2, #1 + + vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result + vmov q0, q8 + vst1.u8 {d4, d5, d6, d7}, [r3]! + vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result + vst1.u8 {d12, d13, d14, d15}, [r3]! + + bne vp8e_filt_blk2d_spo16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16s_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r2, #4 + +sub_pixel_variance16x16s_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q1}, [r4], r12 + vld1.8 {q2}, [r3]! + vld1.8 {q3}, [r4], r12 + vld1.8 {q4}, [r3]! + vld1.8 {q5}, [r4], r12 + vld1.8 {q6}, [r3]! + vld1.8 {q7}, [r4], r12 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r2, r2, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne sub_pixel_variance16x16s_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + add sp, sp, #256 + vmov.32 r0, d0[0] ;return + + pop {r4, pc} + ENDP + + END diff --git a/vp9/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp9/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm new file mode 100644 index 000000000..29975f13e --- /dev/null +++ b/vp9/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm @@ -0,0 +1,224 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_sub_pixel_variance8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon. + +|vp9_sub_pixel_variance8x8_neon| PROC + push {r4-r5, lr} + + ldr r12, _BilinearTaps_coeff_ + ldr r4, [sp, #12] ;load *dst_ptr from stack + ldr r5, [sp, #16] ;load dst_pixels_per_line from stack + ldr lr, [sp, #20] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (9x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + + vld1.u8 {q1}, [r0], r1 ;load src data + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vld1.u8 {q2}, [r0], r1 + vqrshrn.u16 d23, q7, #7 + vld1.u8 {q3}, [r0], r1 + vqrshrn.u16 d24, q8, #7 + vld1.u8 {q4}, [r0], r1 + vqrshrn.u16 d25, q9, #7 + + ;first_pass filtering on the rest 5-line data + vld1.u8 {q5}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d27, q7, #7 + vqrshrn.u16 d28, q8, #7 + vqrshrn.u16 d29, q9, #7 + vqrshrn.u16 d30, q10, #7 + +;Second pass: 8x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + ;skip_secondpass_filter + beq sub_pixel_variance8x8_neon + + add r3, r12, r3, lsl #3 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d23, q2, #7 + vqrshrn.u16 d24, q3, #7 + vqrshrn.u16 d25, q4, #7 + vqrshrn.u16 d26, q5, #7 + vqrshrn.u16 d27, q6, #7 + vqrshrn.u16 d28, q7, #7 + vqrshrn.u16 d29, q8, #7 + + b sub_pixel_variance8x8_neon + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], 
r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;---------------------- +;vp9_variance8x8_neon +sub_pixel_variance8x8_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +sub_pixel_variance8x8_neon_loop + vld1.8 {d0}, [r4], r5 ;load dst data + subs r12, r12, #1 + vld1.8 {d1}, [r4], r5 + vld1.8 {d2}, [r4], r5 + vsubl.u8 q4, d22, d0 ;calculate diff + vld1.8 {d3}, [r4], r5 + + vsubl.u8 q5, d23, d1 + vsubl.u8 q6, d24, d2 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + vsubl.u8 q7, d25, d3 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + + vmov q11, q13 + + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + + vmov q12, q14 + + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + bne sub_pixel_variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #6 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {r4-r5, pc} + + ENDP + +;----------------- + +_BilinearTaps_coeff_ + DCD bilinear_taps_coeff +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp9/encoder/arm/quantize_arm.c b/vp9/encoder/arm/quantize_arm.c new file mode 100644 index 000000000..e8aef4f08 --- /dev/null +++ b/vp9/encoder/arm/quantize_arm.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include <math.h> +#include "vpx_mem/vpx_mem.h" + +#include "vp9/encoder/quantize.h" +#include "vp9/common/entropy.h" + + +#if HAVE_ARMV7 + +/* vp8_quantize_mbX functions here differs from corresponding ones in + * quantize.c only by using quantize_b_pair function pointer instead of + * the regular quantize_b function pointer */ +void vp8_quantize_mby_neon(MACROBLOCK *x) { + int i; + int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED + && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + + for (i = 0; i < 16; i += 2) + x->quantize_b_pair(&x->block[i], &x->block[i + 1], + &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); + + if (has_2nd_order) + x->quantize_b(&x->block[24], &x->e_mbd.block[24]); +} + +void vp8_quantize_mb_neon(MACROBLOCK *x) { + int i; + int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED + && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + + for (i = 0; i < 24; i += 2) + x->quantize_b_pair(&x->block[i], &x->block[i + 1], + &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); + + if (has_2nd_order) + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); +} + + +void vp8_quantize_mbuv_neon(MACROBLOCK *x) { + int i; + + for (i = 16; i < 24; i += 2) + x->quantize_b_pair(&x->block[i], &x->block[i + 1], + &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); +} + +#endif /* HAVE_ARMV7 */ diff --git a/vp9/encoder/arm/quantize_arm.h b/vp9/encoder/arm/quantize_arm.h new file mode 100644 index 000000000..7d2088d2d --- /dev/null +++ b/vp9/encoder/arm/quantize_arm.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef QUANTIZE_ARM_H +#define QUANTIZE_ARM_H + +#if HAVE_ARMV6 + +extern prototype_quantize_block(vp8_fast_quantize_b_armv6); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6 +#endif + +#endif /* HAVE_ARMV6 */ + + +#if HAVE_ARMV7 + +extern prototype_quantize_block(vp8_fast_quantize_b_neon); +extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon + +#undef vp8_quantize_fastquantb_pair +#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon + +#undef vp8_quantize_mb +#define vp8_quantize_mb vp8_quantize_mb_neon + +#undef vp8_quantize_mbuv +#define vp8_quantize_mbuv vp8_quantize_mbuv_neon + +#undef vp8_quantize_mby +#define vp8_quantize_mby vp8_quantize_mby_neon +#endif + +#endif /* HAVE_ARMV7 */ + +#endif + diff --git a/vp9/encoder/arm/variance_arm.c b/vp9/encoder/arm/variance_arm.c new file mode 100644 index 000000000..c3da1135d --- /dev/null +++ b/vp9/encoder/arm/variance_arm.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_config.h" +#include "vp9/encoder/variance.h" +#include "vp9/common/filter.h" +#include "vp9/common/arm/bilinearfilter_arm.h" + +#define HALFNDX 8 + +#if HAVE_ARMV6 + +unsigned int vp9_sub_pixel_variance8x8_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) { + unsigned short first_pass[10 * 8]; + unsigned char second_pass[8 * 8]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 9, 8, HFilter); + vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 8, 8, 8, VFilter); + + return vp9_variance8x8_armv6(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); +} + +unsigned int vp9_sub_pixel_variance16x16_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) { + unsigned short first_pass[36 * 16]; + unsigned char second_pass[20 * 16]; + const short *HFilter, *VFilter; + unsigned int var; + + if (xoffset == HALFNDX && yoffset == 0) { + var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } else if (xoffset == 0 && yoffset == HALFNDX) { + var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } else if (xoffset == HALFNDX && yoffset == HALFNDX) { + var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } else { + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 17, 16, HFilter); + vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 16, 16, 16, VFilter); + + var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); + } + return var; +} + +#endif /* HAVE_ARMV6 */ + + +#if HAVE_ARMV7 + +unsigned int vp9_sub_pixel_variance16x16_neon +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) { + if (xoffset == HALFNDX && yoffset == 0) + return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 0 && yoffset == HALFNDX) + return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == HALFNDX && yoffset == HALFNDX) + return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else + return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); +} + +#endif diff --git a/vp9/encoder/arm/variance_arm.h b/vp9/encoder/arm/variance_arm.h new file mode 100644 index 000000000..c2c208a78 --- /dev/null +++ b/vp9/encoder/arm/variance_arm.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VARIANCE_ARM_H +#define VARIANCE_ARM_H + +#if HAVE_ARMV6 + +extern prototype_sad(vp9_sad16x16_armv6); +extern prototype_variance(vp9_variance16x16_armv6); +extern prototype_variance(vp9_variance8x8_armv6); +extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6); +extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6); +extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6); +extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6); +extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6); +extern prototype_variance(vp9_mse16x16_armv6); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp9_variance_sad16x16 +#define vp9_variance_sad16x16 vp9_sad16x16_armv6 + +#undef vp9_variance_subpixvar16x16 +#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6 + +#undef vp9_variance_subpixvar8x8 +#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6 + +#undef vp9_variance_var16x16 +#define vp9_variance_var16x16 vp9_variance16x16_armv6 + +#undef vp9_variance_mse16x16 +#define vp9_variance_mse16x16 vp9_mse16x16_armv6 + +#undef vp9_variance_var8x8 +#define vp9_variance_var8x8 vp9_variance8x8_armv6 + +#undef vp9_variance_halfpixvar16x16_h +#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6 + +#undef vp9_variance_halfpixvar16x16_v +#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6 + +#undef vp9_variance_halfpixvar16x16_hv +#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6 + +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV6 */ + + +#if HAVE_ARMV7 +extern prototype_sad(vp9_sad4x4_neon); +extern prototype_sad(vp9_sad8x8_neon); +extern prototype_sad(vp9_sad8x16_neon); +extern prototype_sad(vp9_sad16x8_neon); +extern prototype_sad(vp9_sad16x16_neon); + +extern prototype_variance(vp9_variance8x8_neon); +extern prototype_variance(vp9_variance8x16_neon); +extern prototype_variance(vp9_variance16x8_neon); +extern prototype_variance(vp9_variance16x16_neon); + +extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon); +extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon); +extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func); +extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon); +extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon); +extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon); + +extern prototype_variance(vp9_mse16x16_neon); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_variance_sad4x4 +#define vp9_variance_sad4x4 vp9_sad4x4_neon + +#undef vp9_variance_sad8x8 +#define vp9_variance_sad8x8 vp9_sad8x8_neon + +#undef vp9_variance_sad8x16 +#define vp9_variance_sad8x16 vp9_sad8x16_neon + +#undef vp9_variance_sad16x8 +#define vp9_variance_sad16x8 vp9_sad16x8_neon + +#undef vp9_variance_sad16x16 +#define vp9_variance_sad16x16 vp9_sad16x16_neon + +#undef vp9_variance_var8x8 +#define vp9_variance_var8x8 vp9_variance8x8_neon + +#undef vp9_variance_var8x16 +#define vp9_variance_var8x16 vp9_variance8x16_neon + +#undef vp9_variance_var16x8 +#define vp9_variance_var16x8 vp9_variance16x8_neon + +#undef vp9_variance_var16x16 +#define vp9_variance_var16x16 vp9_variance16x16_neon + +#undef vp9_variance_subpixvar8x8 +#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon + +#undef vp9_variance_subpixvar16x16 +#define vp9_variance_subpixvar16x16 
vp9_sub_pixel_variance16x16_neon + +#undef vp9_variance_halfpixvar16x16_h +#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon + +#undef vp9_variance_halfpixvar16x16_v +#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon + +#undef vp9_variance_halfpixvar16x16_hv +#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon + +#undef vp9_variance_mse16x16 +#define vp9_variance_mse16x16 vp9_mse16x16_neon + +#endif + +#endif + +#endif
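For reference, every variance kernel in these ARMv6/NEON files (and their C counterparts) reduces a block to a signed sum and an SSE, then returns SSE - sum^2 / (w*h); the ">> 8" in the 16x16 NEON epilogues and the ">> 6" in the 8x8 epilogue are that division by 256 and 64. A minimal C sketch of the arithmetic follows; the helper name and signature are hypothetical and this code is not part of the patch.

#include <stdint.h>

/* Reference sum/SSE reduction: variance = SSE - sum^2 / (w*h).
 * Corresponds to the vmull.s32 / vshr.s32 / vsub.s32 epilogue in the
 * NEON routines above (shift #8 for 16x16 blocks, #6 for 8x8 blocks). */
static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int sum = 0;
  unsigned int sse_acc = 0;
  int r, c;

  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += diff;                              /* signed sum of differences */
      sse_acc += (unsigned int)(diff * diff);   /* sum of squared differences */
    }
  }
  *sse = sse_acc;
  return sse_acc - (unsigned int)(((int64_t)sum * sum) / (w * h));
}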