From 34591b54dda4a25f42aa5add71b625b2600f6d6a Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 2 Dec 2012 14:14:00 -0800 Subject: Remove ARM optimizations from VP9 Change-Id: I9f0ae635fb9a95c4aa1529c177ccb07e2b76970b --- vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm | 286 ----------- vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm | 291 ----------- .../arm/armv5te/vp9_packtokens_mbrow_armv5.asm | 327 ------------ .../armv5te/vp9_packtokens_partitions_armv5.asm | 465 ----------------- .../arm/armv6/vp9_fast_quantize_b_armv6.asm | 223 -------- vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm | 138 ----- vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm | 95 ---- vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm | 262 ---------- vp9/encoder/arm/armv6/vp9_subtract_armv6.asm | 264 ---------- vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm | 153 ------ vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm | 101 ---- .../armv6/vp9_variance_halfpixvar16x16_h_armv6.asm | 181 ------- .../vp9_variance_halfpixvar16x16_hv_armv6.asm | 222 -------- .../armv6/vp9_variance_halfpixvar16x16_v_armv6.asm | 183 ------- vp9/encoder/arm/armv6/vp9_walsh_v6.asm | 212 -------- vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm | 261 ---------- vp9/encoder/arm/neon/vp9_memcpy_neon.asm | 68 --- vp9/encoder/arm/neon/vp9_mse16x16_neon.asm | 116 ----- vp9/encoder/arm/neon/vp9_picklpf_arm.c | 48 -- vp9/encoder/arm/neon/vp9_sad16_neon.asm | 207 -------- vp9/encoder/arm/neon/vp9_sad8_neon.asm | 209 -------- vp9/encoder/arm/neon/vp9_shortfdct_neon.asm | 221 -------- vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm | 103 ---- .../arm/neon/vp9_subpixelvariance16x16_neon.asm | 425 --------------- .../arm/neon/vp9_subpixelvariance16x16s_neon.asm | 572 --------------------- .../arm/neon/vp9_subpixelvariance8x8_neon.asm | 224 -------- vp9/encoder/arm/neon/vp9_subtract_neon.asm | 185 ------- vp9/encoder/arm/neon/vp9_variance_neon.asm | 276 ---------- vp9/encoder/arm/vp9_arm_csystemdependent.c | 129 ----- vp9/encoder/arm/vp9_boolhuff_arm.c | 33 -- vp9/encoder/arm/vp9_dct_arm.c | 21 - vp9/encoder/arm/vp9_dct_arm.h | 65 --- vp9/encoder/arm/vp9_encodemb_arm.h | 64 --- vp9/encoder/arm/vp9_quantize_arm.c | 57 -- vp9/encoder/arm/vp9_quantize_arm.h | 52 -- vp9/encoder/arm/vp9_variance_arm.c | 112 ---- vp9/encoder/arm/vp9_variance_arm.h | 132 ----- vp9/encoder/vp9_asm_enc_offsets.c | 8 - vp9/encoder/vp9_onyx_if.c | 68 --- vp9/encoder/vp9_picklpf.c | 76 +-- vp9/encoder/vp9_quantize.h | 4 - 41 files changed, 4 insertions(+), 7135 deletions(-) delete mode 100644 vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm delete mode 100644 vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm delete mode 100644 vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm delete mode 100644 vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_subtract_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_walsh_v6.asm delete mode 100644 vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_memcpy_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_mse16x16_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_picklpf_arm.c delete mode 100644 vp9/encoder/arm/neon/vp9_sad16_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_sad8_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_shortfdct_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subpixelvariance8x8_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subtract_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_variance_neon.asm delete mode 100644 vp9/encoder/arm/vp9_arm_csystemdependent.c delete mode 100644 vp9/encoder/arm/vp9_boolhuff_arm.c delete mode 100644 vp9/encoder/arm/vp9_dct_arm.c delete mode 100644 vp9/encoder/arm/vp9_dct_arm.h delete mode 100644 vp9/encoder/arm/vp9_encodemb_arm.h delete mode 100644 vp9/encoder/arm/vp9_quantize_arm.c delete mode 100644 vp9/encoder/arm/vp9_quantize_arm.h delete mode 100644 vp9/encoder/arm/vp9_variance_arm.c delete mode 100644 vp9/encoder/arm/vp9_variance_arm.h (limited to 'vp9/encoder') diff --git a/vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm b/vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm deleted file mode 100644 index 94e65ef8d..000000000 --- a/vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_start_encode| - EXPORT |vp9_encode_bool| - EXPORT |vp8_stop_encode| - EXPORT |vp8_encode_value| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 BOOL_CODER *br -; r1 unsigned char *source - -|vp8_start_encode| PROC - mov r12, #0 - mov r3, #255 - mvn r2, #23 - str r12, [r0, #vp9_writer_lowvalue] - str r3, [r0, #vp9_writer_range] - str r12, [r0, #vp9_writer_value] - str r2, [r0, #vp9_writer_count] - str r12, [r0, #vp9_writer_pos] - str r1, [r0, #vp9_writer_buffer] - bx lr - ENDP - -; r0 BOOL_CODER *br -; r1 int bit -; r2 int probability -|vp9_encode_bool| PROC - push {r4-r9, lr} - - mov r4, r2 - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - - sub r7, r5, #1 ; range-1 - - cmp r1, #0 - mul r6, r4, r7 ; ((range-1) * probability) - - mov r7, #1 - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8) - - addne r2, r2, r4 ; if (bit) lowvalue += split - subne r4, r5, r4 ; if (bit) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r9, #0 - strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r1, [r7, r4] - cmpge r1, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r9, [r7, r4] ; w->buffer[x] - add r9, r9, #1 - strb r9, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r9, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r1, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r1, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r9, r4] ; w->buffer[w->pos++] - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - pop {r4-r9, pc} - ENDP - -; r0 BOOL_CODER *br -|vp8_stop_encode| PROC - push {r4-r10, lr} - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - - mov r10, #32 - -stop_encode_loop - sub r7, r5, #1 ; range-1 - - mov r4, r7, lsl #7 ; ((range-1) * 128) - - mov r7, #1 - add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero_se ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set_se - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start_se -token_zero_while_loop_se - mov r9, #0 - strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start_se - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r1, [r7, r4] - cmpge r1, #0xff - beq token_zero_while_loop_se - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r9, [r7, r4] ; w->buffer[x] - add r9, r9, #1 - strb r9, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set_se - rsb r4, r6, #24 ; 24-offset - ldr r9, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r1, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r1, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r9, r4] ; w->buffer[w->pos++] - -token_count_lt_zero_se - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r10, r10, #1 - bne stop_encode_loop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - pop {r4-r10, pc} - - ENDP - -; r0 BOOL_CODER *br -; r1 int data -; r2 int bits -|vp8_encode_value| PROC - push {r4-r11, lr} - - mov r10, r2 - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - - rsb r4, r10, #32 ; 32-n - - ; v is kept in r1 during the token pack loop - lsl r1, r1, r4 ; r1 = v << 32 - n - -encode_value_loop - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r1, r1, #1 ; bit = v >> n - mov r4, r7, lsl #7 ; ((range-1) * 128) - - mov r7, #1 - add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) - - addcs r2, r2, r4 ; if (bit) lowvalue += split - subcs r4, r5, r4 ; if (bit) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero_ev ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set_ev - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start_ev -token_zero_while_loop_ev - mov r9, #0 - strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start_ev - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop_ev - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r9, [r7, r4] ; w->buffer[x] - add r9, r9, #1 - strb r9, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set_ev - rsb r4, r6, #24 ; 24-offset - ldr r9, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r9, r4] ; w->buffer[w->pos++] - -token_count_lt_zero_ev - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r10, r10, #1 - bne encode_value_loop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - pop {r4-r11, pc} - ENDP - - END diff --git a/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm b/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm deleted file mode 100644 index 9ccbaa6c1..000000000 --- a/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm +++ /dev/null @@ -1,291 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8cx_pack_tokens_armv5| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 vp9_writer *w -; r1 const TOKENEXTRA *p -; r2 int xcount -; r3 vp8_coef_encodings -; s0 vp8_extra_bits -; s1 vp8_coef_tree -|vp8cx_pack_tokens_armv5| PROC - push {r4-r11, lr} - - ; Add size of xcount * sizeof (TOKENEXTRA) to get stop - ; sizeof (TOKENEXTRA) is 8 - sub sp, sp, #12 - add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) - str r2, [sp, #0] - str r3, [sp, #8] ; save vp8_coef_encodings - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - b check_p_lt_stop - -while_p_lt_stop - ldrb r6, [r1, #tokenextra_token] ; t - ldr r4, [sp, #8] ; vp8_coef_encodings - mov lr, #0 - add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t - ldr r9, [r1, #tokenextra_context_tree] ; pp - - ldrb r7, [r1, #tokenextra_skip_eob_node] - - ldr r6, [r4, #vp9_token_value] ; v - ldr r8, [r4, #vp9_token_len] ; n - - ; vp8 specific skip_eob_node - cmp r7, #0 - movne lr, #2 ; i = 2 - subne r8, r8, #1 ; --n - - rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #52] ; vp8_coef_tree - - ; v is kept in r12 during the token pack loop - lsl r12, r6, r4 ; r12 = v << 32 - n - -; loop start -token_loop - ldrb r4, [r9, lr, asr #1] ; pp [i>>1] - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r12, r12, #1 ; bb = v >> n - mul r6, r4, r7 ; ((range-1) * pp[i>>1])) - - ; bb can only be 0 or 1. So only execute this statement - ; if bb == 1, otherwise it will act like i + 0 - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - - ; r10 is used earlier in the loop, but r10 is used as - ; temp variable here. So after r10 is used, reload - ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #52] ; vp8_coef_tree - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r8, r8, #1 ; --n - bne token_loop - - ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #48] ; vp8_extra_bits - ; Add t * sizeof (vp9_extra_bit_struct) to get the desired - ; element. Here vp9_extra_bit_struct == 16 - add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t - - ldr r4, [r12, #vp9_extra_bit_struct_base_val] - cmp r4, #0 - beq skip_extra_bits - -; if( b->base_val) - ldr r8, [r12, #vp9_extra_bit_struct_len] ; L - ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra - cmp r8, #0 ; if( L) - beq no_extra_bits - - ldr r9, [r12, #vp9_extra_bit_struct_prob] - asr r7, lr, #1 ; v=e>>1 - - ldr r10, [r12, #vp9_extra_bit_struct_tree] - str r10, [sp, #4] ; b->tree - - rsb r4, r8, #32 - lsl r12, r7, r4 - - mov lr, #0 ; i = 0 - -extra_bits_loop - ldrb r4, [r9, lr, asr #1] ; pp[i>>1] - sub r7, r5, #1 ; range-1 - lsls r12, r12, #1 ; v >> n - mul r6, r4, r7 ; (range-1) * pp[i>>1] - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = b->tree[i+bb] - add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - clz r6, r4 - sub r6, r6, #24 - - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi extra_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset= shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl extra_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos - 1 - b extra_zero_while_start -extra_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -extra_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq extra_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] - add r10, r10, #1 - strb r10, [r7, r4] -extra_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) - ldr r10, [sp, #4] ; b->tree -extra_count_lt_zero - lsl r2, r2, r6 - - subs r8, r8, #1 ; --n - bne extra_bits_loop ; while (n) - -no_extra_bits - ldr lr, [r1, #4] ; e = p->Extra - add r4, r5, #1 ; range + 1 - tst lr, #1 - lsr r4, r4, #1 ; split = (range + 1) >> 1 - addne r2, r2, r4 ; lowvalue += split - subne r4, r5, r4 ; range = range-split - tst r2, #0x80000000 ; lowvalue & 0x80000000 - lsl r5, r4, #1 ; range <<= 1 - beq end_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] - mov r7, #0 - sub r4, r4, #1 - b end_zero_while_start -end_zero_while_loop - strb r7, [r6, r4] - sub r4, r4, #1 ; x-- -end_zero_while_start - cmp r4, #0 - ldrge r6, [r0, #vp9_writer_buffer] - ldrb r12, [r6, r4] - cmpge r12, #0xff - beq end_zero_while_loop - - ldr r6, [r0, #vp9_writer_buffer] - ldrb r7, [r6, r4] - add r7, r7, #1 - strb r7, [r6, r4] -end_high_bit_not_set - adds r3, r3, #1 ; ++count - lsl r2, r2, #1 ; lowvalue <<= 1 - bne end_count_zero - - ldr r4, [r0, #vp9_writer_pos] - mvn r3, #7 - ldr r7, [r0, #vp9_writer_buffer] - lsr r6, r2, #24 ; lowvalue >> 24 - add r12, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] - strb r6, [r7, r4] -end_count_zero -skip_extra_bits - add r1, r1, #TOKENEXTRA_SZ ; ++p -check_p_lt_stop - ldr r4, [sp, #0] ; stop - cmp r1, r4 ; while( p < stop) - bcc while_p_lt_stop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - add sp, sp, #12 - pop {r4-r11, pc} - ENDP - - END diff --git a/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm b/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm deleted file mode 100644 index 0938ce1a3..000000000 --- a/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm +++ /dev/null @@ -1,327 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8cx_pack_mb_row_tokens_armv5| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 VP8_COMP *cpi -; r1 vp9_writer *w -; r2 vp8_coef_encodings -; r3 vp8_extra_bits -; s0 vp8_coef_tree - -|vp8cx_pack_mb_row_tokens_armv5| PROC - push {r4-r11, lr} - sub sp, sp, #24 - - ; Compute address of cpi->common.mb_rows - ldr r4, _VP8_COMP_common_ - ldr r6, _VP8_COMMON_MBrows_ - add r4, r0, r4 - - ldr r5, [r4, r6] ; load up mb_rows - - str r2, [sp, #20] ; save vp8_coef_encodings - str r5, [sp, #12] ; save mb_rows - str r3, [sp, #8] ; save vp8_extra_bits - - ldr r4, _VP8_COMP_tplist_ - add r4, r0, r4 - ldr r7, [r4, #0] ; dereference cpi->tp_list - - mov r0, r1 ; keep same as other loops - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - -mb_row_loop - - ldr r1, [r7, #tokenlist_start] - ldr r9, [r7, #tokenlist_stop] - str r9, [sp, #0] ; save stop for later comparison - str r7, [sp, #16] ; tokenlist address for next time - - b check_p_lt_stop - - ; actuall work gets done here! - -while_p_lt_stop - ldrb r6, [r1, #tokenextra_token] ; t - ldr r4, [sp, #20] ; vp8_coef_encodings - mov lr, #0 - add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t - ldr r9, [r1, #tokenextra_context_tree] ; pp - - ldrb r7, [r1, #tokenextra_skip_eob_node] - - ldr r6, [r4, #vp9_token_value] ; v - ldr r8, [r4, #vp9_token_len] ; n - - ; vp8 specific skip_eob_node - cmp r7, #0 - movne lr, #2 ; i = 2 - subne r8, r8, #1 ; --n - - rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #60] ; vp8_coef_tree - - ; v is kept in r12 during the token pack loop - lsl r12, r6, r4 ; r12 = v << 32 - n - -; loop start -token_loop - ldrb r4, [r9, lr, asr #1] ; pp [i>>1] - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r12, r12, #1 ; bb = v >> n - mul r6, r4, r7 ; ((range-1) * pp[i>>1])) - - ; bb can only be 0 or 1. So only execute this statement - ; if bb == 1, otherwise it will act like i + 0 - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - - ; r10 is used earlier in the loop, but r10 is used as - ; temp variable here. So after r10 is used, reload - ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #60] ; vp8_coef_tree - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r8, r8, #1 ; --n - bne token_loop - - ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #8] ; vp8_extra_bits - ; Add t * sizeof (vp9_extra_bit_struct) to get the desired - ; element. Here vp9_extra_bit_struct == 16 - add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t - - ldr r4, [r12, #vp9_extra_bit_struct_base_val] - cmp r4, #0 - beq skip_extra_bits - -; if( b->base_val) - ldr r8, [r12, #vp9_extra_bit_struct_len] ; L - ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra - cmp r8, #0 ; if( L) - beq no_extra_bits - - ldr r9, [r12, #vp9_extra_bit_struct_prob] - asr r7, lr, #1 ; v=e>>1 - - ldr r10, [r12, #vp9_extra_bit_struct_tree] - str r10, [sp, #4] ; b->tree - - rsb r4, r8, #32 - lsl r12, r7, r4 - - mov lr, #0 ; i = 0 - -extra_bits_loop - ldrb r4, [r9, lr, asr #1] ; pp[i>>1] - sub r7, r5, #1 ; range-1 - lsls r12, r12, #1 ; v >> n - mul r6, r4, r7 ; (range-1) * pp[i>>1] - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = b->tree[i+bb] - add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - clz r6, r4 - sub r6, r6, #24 - - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi extra_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset= shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl extra_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos - 1 - b extra_zero_while_start -extra_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -extra_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq extra_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] - add r10, r10, #1 - strb r10, [r7, r4] -extra_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) - ldr r10, [sp, #4] ; b->tree -extra_count_lt_zero - lsl r2, r2, r6 - - subs r8, r8, #1 ; --n - bne extra_bits_loop ; while (n) - -no_extra_bits - ldr lr, [r1, #4] ; e = p->Extra - add r4, r5, #1 ; range + 1 - tst lr, #1 - lsr r4, r4, #1 ; split = (range + 1) >> 1 - addne r2, r2, r4 ; lowvalue += split - subne r4, r5, r4 ; range = range-split - tst r2, #0x80000000 ; lowvalue & 0x80000000 - lsl r5, r4, #1 ; range <<= 1 - beq end_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] - mov r7, #0 - sub r4, r4, #1 - b end_zero_while_start -end_zero_while_loop - strb r7, [r6, r4] - sub r4, r4, #1 ; x-- -end_zero_while_start - cmp r4, #0 - ldrge r6, [r0, #vp9_writer_buffer] - ldrb r12, [r6, r4] - cmpge r12, #0xff - beq end_zero_while_loop - - ldr r6, [r0, #vp9_writer_buffer] - ldrb r7, [r6, r4] - add r7, r7, #1 - strb r7, [r6, r4] -end_high_bit_not_set - adds r3, r3, #1 ; ++count - lsl r2, r2, #1 ; lowvalue <<= 1 - bne end_count_zero - - ldr r4, [r0, #vp9_writer_pos] - mvn r3, #7 - ldr r7, [r0, #vp9_writer_buffer] - lsr r6, r2, #24 ; lowvalue >> 24 - add r12, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] - strb r6, [r7, r4] -end_count_zero -skip_extra_bits - add r1, r1, #TOKENEXTRA_SZ ; ++p -check_p_lt_stop - ldr r4, [sp, #0] ; stop - cmp r1, r4 ; while( p < stop) - bcc while_p_lt_stop - - ldr r6, [sp, #12] ; mb_rows - ldr r7, [sp, #16] ; tokenlist address - subs r6, r6, #1 - add r7, r7, #TOKENLIST_SZ ; next element in the array - str r6, [sp, #12] - bne mb_row_loop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - add sp, sp, #24 - pop {r4-r11, pc} - ENDP - -_VP8_COMP_common_ - DCD vp8_comp_common -_VP8_COMMON_MBrows_ - DCD vp8_common_mb_rows -_VP8_COMP_tplist_ - DCD vp8_comp_tplist - - END diff --git a/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm b/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm deleted file mode 100644 index 4611b407d..000000000 --- a/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm +++ /dev/null @@ -1,465 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8cx_pack_tokens_into_partitions_armv5| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 VP8_COMP *cpi -; r1 unsigned char *cx_data -; r2 int num_part -; r3 *size -; s0 vp8_coef_encodings -; s1 vp8_extra_bits, -; s2 const vp9_tree_index *, - -|vp8cx_pack_tokens_into_partitions_armv5| PROC - push {r4-r11, lr} - sub sp, sp, #44 - - ; Compute address of cpi->common.mb_rows - ldr r4, _VP8_COMP_common_ - ldr r6, _VP8_COMMON_MBrows_ - add r4, r0, r4 - - ldr r5, [r4, r6] ; load up mb_rows - - str r5, [sp, #36] ; save mb_rows - str r1, [sp, #24] ; save cx_data - str r2, [sp, #20] ; save num_part - str r3, [sp, #8] ; save *size - - ; *size = 3*(num_part -1 ); - sub r2, r2, #1 ; num_part - 1 - add r2, r2, r2, lsl #1 ; 3*(num_part - 1) - str r2, [r3] - - add r2, r2, r1 ; cx_data + *size - str r2, [sp, #40] ; ptr - - ldr r4, _VP8_COMP_tplist_ - add r4, r0, r4 - ldr r7, [r4, #0] ; dereference cpi->tp_list - str r7, [sp, #32] ; store start of cpi->tp_list - - ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi - add r0, r0, r11 - - mov r11, #0 - str r11, [sp, #28] ; i - -numparts_loop - ldr r10, [sp, #40] ; ptr - ldr r5, [sp, #36] ; move mb_rows to the counting section - sub r5, r5, r11 ; move start point with each partition - ; mb_rows starts at i - str r5, [sp, #12] - - ; Reset all of the VP8 Writer data for each partition that - ; is processed. - ; start_encode - mov r2, #0 ; vp9_writer_lowvalue - mov r5, #255 ; vp9_writer_range - mvn r3, #23 ; vp9_writer_count - - str r2, [r0, #vp9_writer_value] - str r2, [r0, #vp9_writer_pos] - str r10, [r0, #vp9_writer_buffer] - -mb_row_loop - - ldr r1, [r7, #tokenlist_start] - ldr r9, [r7, #tokenlist_stop] - str r9, [sp, #0] ; save stop for later comparison - str r7, [sp, #16] ; tokenlist address for next time - - b check_p_lt_stop - - ; actual work gets done here! - -while_p_lt_stop - ldrb r6, [r1, #tokenextra_token] ; t - ldr r4, [sp, #80] ; vp8_coef_encodings - mov lr, #0 - add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t - ldr r9, [r1, #tokenextra_context_tree] ; pp - - ldrb r7, [r1, #tokenextra_skip_eob_node] - - ldr r6, [r4, #vp9_token_value] ; v - ldr r8, [r4, #vp9_token_len] ; n - - ; vp8 specific skip_eob_node - cmp r7, #0 - movne lr, #2 ; i = 2 - subne r8, r8, #1 ; --n - - rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #88] ; vp8_coef_tree - - ; v is kept in r12 during the token pack loop - lsl r12, r6, r4 ; r12 = v << 32 - n - -; loop start -token_loop - ldrb r4, [r9, lr, asr #1] ; pp [i>>1] - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r12, r12, #1 ; bb = v >> n - mul r6, r4, r7 ; ((range-1) * pp[i>>1])) - - ; bb can only be 0 or 1. So only execute this statement - ; if bb == 1, otherwise it will act like i + 0 - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - - ; r10 is used earlier in the loop, but r10 is used as - ; temp variable here. So after r10 is used, reload - ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #88] ; vp8_coef_tree - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r8, r8, #1 ; --n - bne token_loop - - ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #84] ; vp8_extra_bits - ; Add t * sizeof (vp9_extra_bit_struct) to get the desired - ; element. Here vp9_extra_bit_struct == 16 - add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t - - ldr r4, [r12, #vp9_extra_bit_struct_base_val] - cmp r4, #0 - beq skip_extra_bits - -; if( b->base_val) - ldr r8, [r12, #vp9_extra_bit_struct_len] ; L - ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra - cmp r8, #0 ; if( L) - beq no_extra_bits - - ldr r9, [r12, #vp9_extra_bit_struct_prob] - asr r7, lr, #1 ; v=e>>1 - - ldr r10, [r12, #vp9_extra_bit_struct_tree] - str r10, [sp, #4] ; b->tree - - rsb r4, r8, #32 - lsl r12, r7, r4 - - mov lr, #0 ; i = 0 - -extra_bits_loop - ldrb r4, [r9, lr, asr #1] ; pp[i>>1] - sub r7, r5, #1 ; range-1 - lsls r12, r12, #1 ; v >> n - mul r6, r4, r7 ; (range-1) * pp[i>>1] - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = b->tree[i+bb] - add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - clz r6, r4 - sub r6, r6, #24 - - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi extra_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset= shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl extra_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos - 1 - b extra_zero_while_start -extra_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -extra_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq extra_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] - add r10, r10, #1 - strb r10, [r7, r4] -extra_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) - ldr r10, [sp, #4] ; b->tree -extra_count_lt_zero - lsl r2, r2, r6 - - subs r8, r8, #1 ; --n - bne extra_bits_loop ; while (n) - -no_extra_bits - ldr lr, [r1, #4] ; e = p->Extra - add r4, r5, #1 ; range + 1 - tst lr, #1 - lsr r4, r4, #1 ; split = (range + 1) >> 1 - addne r2, r2, r4 ; lowvalue += split - subne r4, r5, r4 ; range = range-split - tst r2, #0x80000000 ; lowvalue & 0x80000000 - lsl r5, r4, #1 ; range <<= 1 - beq end_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] - mov r7, #0 - sub r4, r4, #1 - b end_zero_while_start -end_zero_while_loop - strb r7, [r6, r4] - sub r4, r4, #1 ; x-- -end_zero_while_start - cmp r4, #0 - ldrge r6, [r0, #vp9_writer_buffer] - ldrb r12, [r6, r4] - cmpge r12, #0xff - beq end_zero_while_loop - - ldr r6, [r0, #vp9_writer_buffer] - ldrb r7, [r6, r4] - add r7, r7, #1 - strb r7, [r6, r4] -end_high_bit_not_set - adds r3, r3, #1 ; ++count - lsl r2, r2, #1 ; lowvalue <<= 1 - bne end_count_zero - - ldr r4, [r0, #vp9_writer_pos] - mvn r3, #7 - ldr r7, [r0, #vp9_writer_buffer] - lsr r6, r2, #24 ; lowvalue >> 24 - add r12, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] - strb r6, [r7, r4] -end_count_zero -skip_extra_bits - add r1, r1, #TOKENEXTRA_SZ ; ++p -check_p_lt_stop - ldr r4, [sp, #0] ; stop - cmp r1, r4 ; while( p < stop) - bcc while_p_lt_stop - - ldr r10, [sp, #20] ; num_parts - mov r1, #TOKENLIST_SZ - mul r1, r10, r1 - - ldr r6, [sp, #12] ; mb_rows - ldr r7, [sp, #16] ; tokenlist address - subs r6, r6, r10 - add r7, r7, r1 ; next element in the array - str r6, [sp, #12] - bgt mb_row_loop - - mov r12, #32 - -stop_encode_loop - sub r7, r5, #1 ; range-1 - - mov r4, r7, lsl #7 ; ((range-1) * 128) - - mov r7, #1 - add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero_se ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set_se - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start_se -token_zero_while_loop_se - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start_se - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop_se - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set_se - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - -token_count_lt_zero_se - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r12, r12, #1 - bne stop_encode_loop - - ldr r10, [sp, #8] ; *size - ldr r11, [r10] - ldr r4, [r0, #vp9_writer_pos] ; w->pos - add r11, r11, r4 ; *size += w->pos - str r11, [r10] - - ldr r9, [sp, #20] ; num_parts - sub r9, r9, #1 - ldr r10, [sp, #28] ; i - cmp r10, r9 ; if(i<(num_part - 1)) - bge skip_write_partition - - ldr r12, [sp, #40] ; ptr - add r12, r12, r4 ; ptr += w->pos - str r12, [sp, #40] - - ldr r9, [sp, #24] ; cx_data - mov r8, r4, asr #8 - strb r4, [r9, #0] - strb r8, [r9, #1] - mov r4, r4, asr #16 - strb r4, [r9, #2] - - add r9, r9, #3 ; cx_data += 3 - str r9, [sp, #24] - -skip_write_partition - - ldr r11, [sp, #28] ; i - ldr r10, [sp, #20] ; num_parts - - add r11, r11, #1 ; i++ - str r11, [sp, #28] - - ldr r7, [sp, #32] ; cpi->tp_list[i] - mov r1, #TOKENLIST_SZ - add r7, r7, r1 ; next element in cpi->tp_list - str r7, [sp, #32] ; cpi->tp_list[i+1] - - cmp r10, r11 - bgt numparts_loop - - - add sp, sp, #44 - pop {r4-r11, pc} - ENDP - -_VP8_COMP_common_ - DCD vp8_comp_common -_VP8_COMMON_MBrows_ - DCD vp8_common_mb_rows -_VP8_COMP_tplist_ - DCD vp8_comp_tplist -_VP8_COMP_bc2_ - DCD vp8_comp_bc2 - - END diff --git a/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm b/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm deleted file mode 100644 index 4f75ef5e7..000000000 --- a/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm +++ /dev/null @@ -1,223 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_fast_quantize_b_armv6| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 BLOCK *b -; r1 BLOCKD *d -|vp8_fast_quantize_b_armv6| PROC - stmfd sp!, {r1, r4-r11, lr} - - ldr r3, [r0, #vp8_block_coeff] ; coeff - ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast - ldr r5, [r0, #vp8_block_round] ; round - ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff - ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff - ldr r8, [r1, #vp8_blockd_dequant] ; dequant - - ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction - ; is used to update the counter so that - ; it can be used to mark nonzero - ; quantized coefficient pairs. - - mov r1, #0 ; flags for quantized coeffs - - ; PART 1: quantization and dequantization loop -loop - ldr r9, [r3], #4 ; [z1 | z0] - ldr r10, [r5], #4 ; [r1 | r0] - ldr r11, [r4], #4 ; [q1 | q0] - - ssat16 lr, #1, r9 ; [sz1 | sz0] - eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] - ssub16 r9, r9, lr ; x = (z ^ sz) - sz - sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] - - ldr r12, [r3], #4 ; [z3 | z2] - - smulbb r0, r9, r11 ; [(x0+r0)*q0] - smultt r9, r9, r11 ; [(x1+r1)*q1] - - ldr r10, [r5], #4 ; [r3 | r2] - - ssat16 r11, #1, r12 ; [sz3 | sz2] - eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] - pkhtb r0, r9, r0, asr #16 ; [y1 | y0] - ldr r9, [r4], #4 ; [q3 | q2] - ssub16 r12, r12, r11 ; x = (z ^ sz) - sz - - sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] - - eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] - - smulbb r10, r12, r9 ; [(x2+r2)*q2] - smultt r12, r12, r9 ; [(x3+r3)*q3] - - ssub16 r0, r0, lr ; x = (y ^ sz) - sz - - cmp r0, #0 ; check if zero - orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs - - str r0, [r6], #4 ; *qcoeff++ = x - ldr r9, [r8], #4 ; [dq1 | dq0] - - pkhtb r10, r12, r10, asr #16 ; [y3 | y2] - eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] - ssub16 r10, r10, r11 ; x = (y ^ sz) - sz - - cmp r10, #0 ; check if zero - orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs - - str r10, [r6], #4 ; *qcoeff++ = x - ldr r11, [r8], #4 ; [dq3 | dq2] - - smulbb r12, r0, r9 ; [x0*dq0] - smultt r0, r0, r9 ; [x1*dq1] - - smulbb r9, r10, r11 ; [x2*dq2] - smultt r10, r10, r11 ; [x3*dq3] - - lsls r2, r2, #2 ; update loop counter - strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] - strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] - strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] - strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] - add r7, r7, #8 ; dqcoeff += 8 - bne loop - - ; PART 2: check position for eob... - mov lr, #0 ; init eob - cmp r1, #0 ; coeffs after quantization? - ldr r11, [sp, #0] ; restore BLOCKD pointer - beq end ; skip eob calculations if all zero - - ldr r0, [r11, #vp8_blockd_qcoeff] - - ; check shortcut for nonzero qcoeffs - tst r1, #0x80 - bne quant_coeff_15_14 - tst r1, #0x20 - bne quant_coeff_13_11 - tst r1, #0x8 - bne quant_coeff_12_7 - tst r1, #0x40 - bne quant_coeff_10_9 - tst r1, #0x10 - bne quant_coeff_8_3 - tst r1, #0x2 - bne quant_coeff_6_5 - tst r1, #0x4 - bne quant_coeff_4_2 - b quant_coeff_1_0 - -quant_coeff_15_14 - ldrh r2, [r0, #30] ; rc=15, i=15 - mov lr, #16 - cmp r2, #0 - bne end - - ldrh r3, [r0, #28] ; rc=14, i=14 - mov lr, #15 - cmp r3, #0 - bne end - -quant_coeff_13_11 - ldrh r2, [r0, #22] ; rc=11, i=13 - mov lr, #14 - cmp r2, #0 - bne end - -quant_coeff_12_7 - ldrh r3, [r0, #14] ; rc=7, i=12 - mov lr, #13 - cmp r3, #0 - bne end - - ldrh r2, [r0, #20] ; rc=10, i=11 - mov lr, #12 - cmp r2, #0 - bne end - -quant_coeff_10_9 - ldrh r3, [r0, #26] ; rc=13, i=10 - mov lr, #11 - cmp r3, #0 - bne end - - ldrh r2, [r0, #24] ; rc=12, i=9 - mov lr, #10 - cmp r2, #0 - bne end - -quant_coeff_8_3 - ldrh r3, [r0, #18] ; rc=9, i=8 - mov lr, #9 - cmp r3, #0 - bne end - - ldrh r2, [r0, #12] ; rc=6, i=7 - mov lr, #8 - cmp r2, #0 - bne end - -quant_coeff_6_5 - ldrh r3, [r0, #6] ; rc=3, i=6 - mov lr, #7 - cmp r3, #0 - bne end - - ldrh r2, [r0, #4] ; rc=2, i=5 - mov lr, #6 - cmp r2, #0 - bne end - -quant_coeff_4_2 - ldrh r3, [r0, #10] ; rc=5, i=4 - mov lr, #5 - cmp r3, #0 - bne end - - ldrh r2, [r0, #16] ; rc=8, i=3 - mov lr, #4 - cmp r2, #0 - bne end - - ldrh r3, [r0, #8] ; rc=4, i=2 - mov lr, #3 - cmp r3, #0 - bne end - -quant_coeff_1_0 - ldrh r2, [r0, #2] ; rc=1, i=1 - mov lr, #2 - cmp r2, #0 - bne end - - mov lr, #1 ; rc=0, i=0 - -end - str lr, [r11, #vp8_blockd_eob] - ldmfd sp!, {r1, r4-r11, pc} - - ENDP - -loop_count - DCD 0x1000000 - - END diff --git a/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm b/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm deleted file mode 100644 index 8e7283667..000000000 --- a/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on vp9_variance16x16_armv6. In this function, sum is never used. -; So, we can remove this part of calculation. - -|vp8_mse16x16_armv6| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm b/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm deleted file mode 100644 index 4dcceb2bf..000000000 --- a/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm +++ /dev/null @@ -1,95 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 const unsigned char *src_ptr -; r1 int src_stride -; r2 const unsigned char *ref_ptr -; r3 int ref_stride -; stack max_sad (not used) -|vp8_sad16x16_armv6| PROC - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - mov r4, #0 ; sad = 0; - mov r5, #8 ; loop count - -loop - ; 1st row - ldr r6, [r0, #0x0] ; load 4 src pixels (1A) - ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) - ldr r7, [r0, #0x4] ; load 4 src pixels (1A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) - ldr r10, [r0, #0x8] ; load 4 src pixels (1B) - ldr r11, [r0, #0xC] ; load 4 src pixels (1B) - - usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - ldr r6, [r0, #0x0] ; load 4 src pixels (2A) - ldr r7, [r0, #0x4] ; load 4 src pixels (2A) - add r4, r4, r8 ; add partial sad values - - ; 2nd row - ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) - ldr r10, [r0, #0x8] ; load 4 src pixels (2B) - ldr r11, [r0, #0xC] ; load 4 src pixels (2B) - - usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - subs r5, r5, #1 ; decrement loop counter - add r4, r4, r8 ; add partial sad values - - bne loop - - mov r0, r4 ; return sad - ldmfd sp!, {r4-r12, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm b/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm deleted file mode 100644 index 8034c1db9..000000000 --- a/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm +++ /dev/null @@ -1,262 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_fdct4x4_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY -; void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_armv6| PROC - - stmfd sp!, {r4 - r12, lr} - - ; PART 1 - - ; coeffs 0-3 - ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2] - - ldr r10, c7500 - ldr r11, c14500 - ldr r12, c0x22a453a0 ; [2217*4 | 5352*4] - ldr lr, c0x00080008 - ror r5, r5, #16 ; [i2 | i3] - - qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift - qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8 - smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8 - - smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6] - - pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2 - pkhbt r6, r5, r7, lsl #4 ; [o3 | o2] - - str r6, [r1, #4] - - ; coeffs 4-7 - ror r9, r9, #16 ; [i6 | i7] - - qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift - qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8 - smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8 - - smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10] - - pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2 - pkhbt r6, r8, r7, lsl #4 ; [o7 | o6] - - str r6, [r1, #12] - - ; coeffs 8-11 - ror r5, r5, #16 ; [i10 | i11] - - qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift - qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8 - smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8 - - smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14] - - pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2 - pkhbt r6, r8, r7, lsl #4 ; [o11 | o10] - - str r6, [r1, #20] - - ; coeffs 12-15 - ror r5, r5, #16 ; [i14 | i15] - - qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift - qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8 - smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8 - - smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500) - - pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2 - pkhbt r6, r5, r7, lsl #4 ; [o15 | o14] - - str r6, [r1, #28] - - - ; PART 2 ------------------------------------------------- - ldr r11, c12000 - ldr r10, c51000 - ldr lr, c0x00070007 - - qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12] - qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8] - qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8] - qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12] - - qadd16 r4, r4, lr ; a1 + 7 - - add r0, r11, #0x10000 ; add (d!=0) - - qadd16 r2, r4, r5 ; a1 + b1 + 7 - qsub16 r3, r4, r5 ; a1 - b1 + 7 - - ldr r12, c0x08a914e8 ; [2217 | 5352] - - lsl r8, r2, #16 ; prepare bottom halfword for scaling - asr r2, r2, #4 ; scale top halfword - lsl r9, r3, #16 ; prepare bottom halfword for scaling - asr r3, r3, #4 ; scale top halfword - pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword - pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword - - smulbt r2, r6, r12 ; [ ------ | c1*2217] - str r4, [r1, #0] ; [ o1 | o0] - smultt r3, r6, r12 ; [c1*2217 | ------ ] - str r5, [r1, #16] ; [ o9 | o8] - - smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] - smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] - - smulbb r2, r6, r12 ; [ ------ | c1*5352] - smultb r3, r6, r12 ; [c1*5352 | ------ ] - - lsls r6, r7, #16 ; d1 != 0 ? - addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) - addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) - asrs r6, r7, #16 - addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) - addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) - - smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 - smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 - - pkhtb r9, r9, r8, asr #16 - - sub r4, r4, r2 - sub r5, r5, r3 - - ldr r3, [r1, #4] ; [i3 | i2] - - pkhtb r5, r5, r4, asr #16 ; [o13|o12] - - str r9, [r1, #8] ; [o5 | 04] - - ldr r9, [r1, #12] ; [i7 | i6] - ldr r8, [r1, #28] ; [i15|i14] - ldr r2, [r1, #20] ; [i11|i10] - str r5, [r1, #24] ; [o13|o12] - - qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14] - qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10] - - qadd16 r4, r4, lr ; a1 + 7 - - qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10] - qadd16 r2, r4, r5 ; a1 + b1 + 7 - qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14] - qsub16 r3, r4, r5 ; a1 - b1 + 7 - - lsl r8, r2, #16 ; prepare bottom halfword for scaling - asr r2, r2, #4 ; scale top halfword - lsl r9, r3, #16 ; prepare bottom halfword for scaling - asr r3, r3, #4 ; scale top halfword - pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword - pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword - - smulbt r2, r6, r12 ; [ ------ | c1*2217] - str r4, [r1, #4] ; [ o3 | o2] - smultt r3, r6, r12 ; [c1*2217 | ------ ] - str r5, [r1, #20] ; [ o11 | o10] - - smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] - smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] - - smulbb r2, r6, r12 ; [ ------ | c1*5352] - smultb r3, r6, r12 ; [c1*5352 | ------ ] - - lsls r6, r7, #16 ; d1 != 0 ? - addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) - addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) - - asrs r6, r7, #16 - addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) - addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) - - smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 - smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 - - pkhtb r9, r9, r8, asr #16 - - sub r4, r4, r2 - sub r5, r5, r3 - - str r9, [r1, #12] ; [o7 | o6] - pkhtb r5, r5, r4, asr #16 ; [o15|o14] - - str r5, [r1, #28] ; [o15|o14] - - ldmfd sp!, {r4 - r12, pc} - - ENDP - -; Used constants -c7500 - DCD 7500 -c14500 - DCD 14500 -c0x22a453a0 - DCD 0x22a453a0 -c0x00080008 - DCD 0x00080008 -c12000 - DCD 12000 -c51000 - DCD 51000 -c0x00070007 - DCD 0x00070007 -c0x08a914e8 - DCD 0x08a914e8 - - END diff --git a/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm b/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm deleted file mode 100644 index e53c1ed5b..000000000 --- a/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm +++ /dev/null @@ -1,264 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_subtract_mby_armv6| - EXPORT |vp8_subtract_mbuv_armv6| - EXPORT |vp8_subtract_b_armv6| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 BLOCK *be -; r1 BLOCKD *bd -; r2 int pitch -|vp8_subtract_b_armv6| PROC - - stmfd sp!, {r4-r9} - - ldr r4, [r0, #vp8_block_base_src] - ldr r5, [r0, #vp8_block_src] - ldr r6, [r0, #vp8_block_src_diff] - - ldr r3, [r4] - ldr r7, [r0, #vp8_block_src_stride] - add r3, r3, r5 ; src = *base_src + src - ldr r8, [r1, #vp8_blockd_predictor] - - mov r9, #4 ; loop count - -loop_block - - ldr r0, [r3], r7 ; src - ldr r1, [r8], r2 ; pred - - uxtb16 r4, r0 ; [s2 | s0] - uxtb16 r5, r1 ; [p2 | p0] - uxtb16 r0, r0, ror #8 ; [s3 | s1] - uxtb16 r1, r1, ror #8 ; [p3 | p1] - - usub16 r4, r4, r5 ; [d2 | d0] - usub16 r5, r0, r1 ; [d3 | d1] - - subs r9, r9, #1 ; decrement loop counter - - pkhbt r0, r4, r5, lsl #16 ; [d1 | d0] - pkhtb r1, r5, r4, asr #16 ; [d3 | d2] - - str r0, [r6, #0] ; diff - str r1, [r6, #4] ; diff - - add r6, r6, r2, lsl #1 ; update diff pointer - bne loop_block - - ldmfd sp!, {r4-r9} - mov pc, lr - - ENDP - - -; r0 short *diff -; r1 unsigned char *usrc -; r2 unsigned char *vsrc -; r3 unsigned char *pred -; stack int stride -|vp8_subtract_mbuv_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - add r0, r0, #512 ; set *diff point to Cb - add r3, r3, #256 ; set *pred point to Cb - - mov r4, #8 ; loop count - ldr r5, [sp, #40] ; stride - - ; Subtract U block -loop_u - ldr r6, [r1] ; src (A) - ldr r7, [r3], #4 ; pred (A) - - uxtb16 r8, r6 ; [s2 | s0] (A) - uxtb16 r9, r7 ; [p2 | p0] (A) - uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) - uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) - - usub16 r6, r8, r9 ; [d2 | d0] (A) - usub16 r7, r10, r11 ; [d3 | d1] (A) - - ldr r10, [r1, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) - - str r8, [r0], #4 ; diff (A) - uxtb16 r8, r10 ; [s2 | s0] (B) - str r9, [r0], #4 ; diff (A) - - uxtb16 r9, r11 ; [p2 | p0] (B) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) - - usub16 r6, r8, r9 ; [d2 | d0] (B) - usub16 r7, r10, r11 ; [d3 | d1] (B) - - add r1, r1, r5 ; update usrc pointer - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) - - str r8, [r0], #4 ; diff (B) - subs r4, r4, #1 ; update loop counter - str r9, [r0], #4 ; diff (B) - - bne loop_u - - mov r4, #8 ; loop count - - ; Subtract V block -loop_v - ldr r6, [r2] ; src (A) - ldr r7, [r3], #4 ; pred (A) - - uxtb16 r8, r6 ; [s2 | s0] (A) - uxtb16 r9, r7 ; [p2 | p0] (A) - uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) - uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) - - usub16 r6, r8, r9 ; [d2 | d0] (A) - usub16 r7, r10, r11 ; [d3 | d1] (A) - - ldr r10, [r2, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) - - str r8, [r0], #4 ; diff (A) - uxtb16 r8, r10 ; [s2 | s0] (B) - str r9, [r0], #4 ; diff (A) - - uxtb16 r9, r11 ; [p2 | p0] (B) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) - - usub16 r6, r8, r9 ; [d2 | d0] (B) - usub16 r7, r10, r11 ; [d3 | d1] (B) - - add r2, r2, r5 ; update vsrc pointer - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) - - str r8, [r0], #4 ; diff (B) - subs r4, r4, #1 ; update loop counter - str r9, [r0], #4 ; diff (B) - - bne loop_v - - ldmfd sp!, {r4-r12, pc} - - ENDP - - -; r0 short *diff -; r1 unsigned char *src -; r2 unsigned char *pred -; r3 int stride -|vp8_subtract_mby_armv6| PROC - - stmfd sp!, {r4-r11} - - mov r4, #16 -loop - ldr r6, [r1] ; src (A) - ldr r7, [r2], #4 ; pred (A) - - uxtb16 r8, r6 ; [s2 | s0] (A) - uxtb16 r9, r7 ; [p2 | p0] (A) - uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) - uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) - - usub16 r6, r8, r9 ; [d2 | d0] (A) - usub16 r7, r10, r11 ; [d3 | d1] (A) - - ldr r10, [r1, #4] ; src (B) - ldr r11, [r2], #4 ; pred (B) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) - - str r8, [r0], #4 ; diff (A) - uxtb16 r8, r10 ; [s2 | s0] (B) - str r9, [r0], #4 ; diff (A) - - uxtb16 r9, r11 ; [p2 | p0] (B) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) - - usub16 r6, r8, r9 ; [d2 | d0] (B) - usub16 r7, r10, r11 ; [d3 | d1] (B) - - ldr r10, [r1, #8] ; src (C) - ldr r11, [r2], #4 ; pred (C) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) - - str r8, [r0], #4 ; diff (B) - uxtb16 r8, r10 ; [s2 | s0] (C) - str r9, [r0], #4 ; diff (B) - - uxtb16 r9, r11 ; [p2 | p0] (C) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (C) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (C) - - usub16 r6, r8, r9 ; [d2 | d0] (C) - usub16 r7, r10, r11 ; [d3 | d1] (C) - - ldr r10, [r1, #12] ; src (D) - ldr r11, [r2], #4 ; pred (D) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) - - str r8, [r0], #4 ; diff (C) - uxtb16 r8, r10 ; [s2 | s0] (D) - str r9, [r0], #4 ; diff (C) - - uxtb16 r9, r11 ; [p2 | p0] (D) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (D) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (D) - - usub16 r6, r8, r9 ; [d2 | d0] (D) - usub16 r7, r10, r11 ; [d3 | d1] (D) - - add r1, r1, r3 ; update src pointer - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) - - str r8, [r0], #4 ; diff (D) - subs r4, r4, #1 ; update loop counter - str r9, [r0], #4 ; diff (D) - - bne loop - - ldmfd sp!, {r4-r11} - mov pc, lr - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm deleted file mode 100644 index aa4727e66..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm +++ /dev/null @@ -1,153 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance16x16_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm deleted file mode 100644 index 101f6838d..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm +++ /dev/null @@ -1,101 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance8x8_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance8x8_armv6| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm deleted file mode 100644 index e25436c22..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm +++ /dev/null @@ -1,181 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_h_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance_halfpixvar16x16_h_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm deleted file mode 100644 index 6ad5e90bb..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm +++ /dev/null @@ -1,222 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_hv_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance_halfpixvar16x16_hv_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; pointer to pixels on the next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load source pixels a, row N - ldr r6, [r0, #1] ; load source pixels b, row N - ldr r5, [r9, #0] ; load source pixels c, row N+1 - ldr r7, [r9, #1] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #0] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load source pixels a, row N - ldr r6, [r0, #5] ; load source pixels b, row N - ldr r5, [r9, #4] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #5] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #4] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load source pixels a, row N - ldr r6, [r0, #9] ; load source pixels b, row N - ldr r5, [r9, #8] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #9] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #8] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load source pixels a, row N - ldr r6, [r0, #13] ; load source pixels b, row N - ldr r5, [r9, #12] ; load source pixels c, row N+1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - ldr r7, [r9, #13] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #12] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm deleted file mode 100644 index c1ac5a1cb..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_v_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance_halfpixvar16x16_v_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; set src pointer to next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r9, #0] ; load 4 src pixels from next row - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r9, #4] ; load 4 src pixels from next row - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r9, #8] ; load 4 src pixels from next row - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r9, #12] ; load 4 src pixels from next row - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vp9/encoder/arm/armv6/vp9_walsh_v6.asm b/vp9/encoder/arm/armv6/vp9_walsh_v6.asm deleted file mode 100644 index 5eaf3f25a..000000000 --- a/vp9/encoder/arm/armv6/vp9_walsh_v6.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_walsh4x4_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) -; r0 short *input, -; r1 short *output, -; r2 int pitch -|vp8_short_walsh4x4_armv6| PROC - - stmdb sp!, {r4 - r11, lr} - - ldrd r4, r5, [r0], r2 - ldr lr, c00040004 - ldrd r6, r7, [r0], r2 - - ; 0-3 - qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2] - qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2] - - ldrd r8, r9, [r0], r2 - ; 4-7 - qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6] - qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6] - - ldrd r10, r11, [r0] - ; 8-11 - qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10] - qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10] - - ; 12-15 - qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14] - qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14] - - - lsls r2, r3, #16 - smuad r11, r3, lr ; A0 = a1<<2 + d1<<2 - addne r11, r11, #1 ; A0 += (a1!=0) - - lsls r2, r7, #16 - smuad r12, r7, lr ; C0 = a1<<2 + d1<<2 - addne r12, r12, #1 ; C0 += (a1!=0) - - add r0, r11, r12 ; a1_0 = A0 + C0 - sub r11, r11, r12 ; b1_0 = A0 - C0 - - lsls r2, r5, #16 - smuad r12, r5, lr ; B0 = a1<<2 + d1<<2 - addne r12, r12, #1 ; B0 += (a1!=0) - - lsls r2, r9, #16 - smuad r2, r9, lr ; D0 = a1<<2 + d1<<2 - addne r2, r2, #1 ; D0 += (a1!=0) - - add lr, r12, r2 ; d1_0 = B0 + D0 - sub r12, r12, r2 ; c1_0 = B0 - D0 - - ; op[0,4,8,12] - adds r2, r0, lr ; a2 = a1_0 + d1_0 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r0, r0, lr ; d2 = a1_0 - d1_0 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1] ; op[0] - - addmi r0, r0, #1 ; += a2 < 0 - add r0, r0, #3 ; += 3 - ldr lr, c00040004 - mov r0, r0, asr #3 ; >> 3 - strh r0, [r1, #24] ; op[12] - - adds r2, r11, r12 ; b2 = b1_0 + c1_0 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r0, r11, r12 ; c2 = b1_0 - c1_0 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #8] ; op[4] - - addmi r0, r0, #1 ; += a2 < 0 - add r0, r0, #3 ; += 3 - smusd r3, r3, lr ; A3 = a1<<2 - d1<<2 - smusd r7, r7, lr ; C3 = a1<<2 - d1<<2 - mov r0, r0, asr #3 ; >> 3 - strh r0, [r1, #16] ; op[8] - - - ; op[3,7,11,15] - add r0, r3, r7 ; a1_3 = A3 + C3 - sub r3, r3, r7 ; b1_3 = A3 - C3 - - smusd r5, r5, lr ; B3 = a1<<2 - d1<<2 - smusd r9, r9, lr ; D3 = a1<<2 - d1<<2 - add r7, r5, r9 ; d1_3 = B3 + D3 - sub r5, r5, r9 ; c1_3 = B3 - D3 - - adds r2, r0, r7 ; a2 = a1_3 + d1_3 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r3, r5 ; b2 = b1_3 + c1_3 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #6] ; op[3] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r3, r5 ; c2 = b1_3 - c1_3 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #14] ; op[7] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r0, r7 ; d2 = a1_3 - d1_3 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #22] ; op[11] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - smuad r3, r4, lr ; A1 = b1<<2 + c1<<2 - smuad r5, r8, lr ; C1 = b1<<2 + c1<<2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #30] ; op[15] - - ; op[1,5,9,13] - add r0, r3, r5 ; a1_1 = A1 + C1 - sub r3, r3, r5 ; b1_1 = A1 - C1 - - smuad r7, r6, lr ; B1 = b1<<2 + c1<<2 - smuad r9, r10, lr ; D1 = b1<<2 + c1<<2 - add r5, r7, r9 ; d1_1 = B1 + D1 - sub r7, r7, r9 ; c1_1 = B1 - D1 - - adds r2, r0, r5 ; a2 = a1_1 + d1_1 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r3, r7 ; b2 = b1_1 + c1_1 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #2] ; op[1] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r3, r7 ; c2 = b1_1 - c1_1 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #10] ; op[5] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r0, r5 ; d2 = a1_1 - d1_1 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #18] ; op[9] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - smusd r4, r4, lr ; A2 = b1<<2 - c1<<2 - smusd r8, r8, lr ; C2 = b1<<2 - c1<<2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #26] ; op[13] - - - ; op[2,6,10,14] - add r11, r4, r8 ; a1_2 = A2 + C2 - sub r12, r4, r8 ; b1_2 = A2 - C2 - - smusd r6, r6, lr ; B2 = b1<<2 - c1<<2 - smusd r10, r10, lr ; D2 = b1<<2 - c1<<2 - add r4, r6, r10 ; d1_2 = B2 + D2 - sub r8, r6, r10 ; c1_2 = B2 - D2 - - adds r2, r11, r4 ; a2 = a1_2 + d1_2 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r12, r8 ; b2 = b1_2 + c1_2 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #4] ; op[2] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r12, r8 ; c2 = b1_2 - c1_2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #12] ; op[6] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r11, r4 ; d2 = a1_2 - d1_2 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #20] ; op[10] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #28] ; op[14] - - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_walsh4x4_armv6| - -c00040004 - DCD 0x00040004 - - END diff --git a/vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm b/vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm deleted file mode 100644 index c68233617..000000000 --- a/vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm +++ /dev/null @@ -1,261 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_fast_quantize_b_neon| - EXPORT |vp8_fast_quantize_b_pair_neon| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=4 - -;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); -|vp8_fast_quantize_b_pair_neon| PROC - - stmfd sp!, {r4-r9} - vstmdb sp!, {q4-q7} - - ldr r4, [r0, #vp8_block_coeff] - ldr r5, [r0, #vp8_block_quant_fast] - ldr r6, [r0, #vp8_block_round] - - vld1.16 {q0, q1}, [r4@128] ; load z - - ldr r7, [r2, #vp8_blockd_qcoeff] - - vabs.s16 q4, q0 ; calculate x = abs(z) - vabs.s16 q5, q1 - - ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative - vshr.s16 q2, q0, #15 ; sz - vshr.s16 q3, q1, #15 - - vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15] - vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15] - - ldr r4, [r1, #vp8_block_coeff] - - vadd.s16 q4, q6 ; x + Round - vadd.s16 q5, q7 - - vld1.16 {q0, q1}, [r4@128] ; load z2 - - vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16 - vqdmulh.s16 q5, q9 - - vabs.s16 q10, q0 ; calculate x2 = abs(z_2) - vabs.s16 q11, q1 - vshr.s16 q12, q0, #15 ; sz2 - vshr.s16 q13, q1, #15 - - ;modify data to have its original sign - veor.s16 q4, q2 ; y^sz - veor.s16 q5, q3 - - vadd.s16 q10, q6 ; x2 + Round - vadd.s16 q11, q7 - - ldr r8, [r2, #vp8_blockd_dequant] - - vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16 - vqdmulh.s16 q11, q9 - - vshr.s16 q4, #1 ; right shift 1 after vqdmulh - vshr.s16 q5, #1 - - vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i] - - vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) - vsub.s16 q5, q3 - - vshr.s16 q10, #1 ; right shift 1 after vqdmulh - vshr.s16 q11, #1 - - ldr r9, [r2, #vp8_blockd_dqcoeff] - - veor.s16 q10, q12 ; y2^sz2 - veor.s16 q11, q13 - - vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1 - - - vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement) - vsub.s16 q11, q13 - - ldr r6, [r3, #vp8_blockd_qcoeff] - - vmul.s16 q2, q6, q4 ; x * Dequant - vmul.s16 q3, q7, q5 - - ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table - - vceq.s16 q8, q8 ; set q8 to all 1 - - vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2 - - vmul.s16 q12, q6, q10 ; x2 * Dequant - vmul.s16 q13, q7, q11 - - vld1.16 {q6, q7}, [r0@128] ; load inverse scan order - - vtst.16 q14, q4, q8 ; now find eob - vtst.16 q15, q5, q8 ; non-zero element is set to all 1 - - vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant - - ldr r7, [r3, #vp8_blockd_dqcoeff] - - vand q0, q6, q14 ; get all valid numbers from scan array - vand q1, q7, q15 - - vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant - - vtst.16 q2, q10, q8 ; now find eob - vtst.16 q3, q11, q8 ; non-zero element is set to all 1 - - vmax.u16 q0, q0, q1 ; find maximum value in q0, q1 - - vand q10, q6, q2 ; get all valid numbers from scan array - vand q11, q7, q3 - vmax.u16 q10, q10, q11 ; find maximum value in q10, q11 - - vmax.u16 d0, d0, d1 - vmax.u16 d20, d20, d21 - vmovl.u16 q0, d0 - vmovl.u16 q10, d20 - - - vmax.u32 d0, d0, d1 - vmax.u32 d20, d20, d21 - vpmax.u32 d0, d0, d0 - vpmax.u32 d20, d20, d20 - - add r4, r2, #vp8_blockd_eob - add r5, r3, #vp8_blockd_eob - - vst1.32 {d0[0]}, [r4@32] - vst1.32 {d20[0]}, [r5@32] - - vldmia sp!, {q4-q7} - ldmfd sp!, {r4-r9} - bx lr - - ENDP - -;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) -|vp8_fast_quantize_b_neon| PROC - - stmfd sp!, {r4-r7} - - ldr r3, [r0, #vp8_block_coeff] - ldr r4, [r0, #vp8_block_quant_fast] - ldr r5, [r0, #vp8_block_round] - - vld1.16 {q0, q1}, [r3@128] ; load z - vorr.s16 q14, q0, q1 ; check if all zero (step 1) - ldr r6, [r1, #vp8_blockd_qcoeff] - ldr r7, [r1, #vp8_blockd_dqcoeff] - vorr.s16 d28, d28, d29 ; check if all zero (step 2) - - vabs.s16 q12, q0 ; calculate x = abs(z) - vabs.s16 q13, q1 - - ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative - vshr.s16 q2, q0, #15 ; sz - vmov r2, r3, d28 ; check if all zero (step 3) - vshr.s16 q3, q1, #15 - - vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15] - vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15] - - vadd.s16 q12, q14 ; x + Round - vadd.s16 q13, q15 - - ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table - - vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16 - vqdmulh.s16 q13, q9 - - vld1.16 {q10, q11}, [r0@128]; load inverse scan order - - vceq.s16 q8, q8 ; set q8 to all 1 - - ldr r4, [r1, #vp8_blockd_dequant] - - vshr.s16 q12, #1 ; right shift 1 after vqdmulh - vshr.s16 q13, #1 - - orr r2, r2, r3 ; check if all zero (step 4) - cmp r2, #0 ; check if all zero (step 5) - beq zero_output ; check if all zero (step 6) - - ;modify data to have its original sign - veor.s16 q12, q2 ; y^sz - veor.s16 q13, q3 - - vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) - vsub.s16 q13, q3 - - vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i] - - vtst.16 q14, q12, q8 ; now find eob - vtst.16 q15, q13, q8 ; non-zero element is set to all 1 - - vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1 - - vand q10, q10, q14 ; get all valid numbers from scan array - vand q11, q11, q15 - - - vmax.u16 q0, q10, q11 ; find maximum value in q0, q1 - vmax.u16 d0, d0, d1 - vmovl.u16 q0, d0 - - vmul.s16 q2, q12 ; x * Dequant - vmul.s16 q3, q13 - - vmax.u32 d0, d0, d1 - vpmax.u32 d0, d0, d0 - - vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant - - add r4, r1, #vp8_blockd_eob - vst1.32 {d0[0]}, [r4@32] - - ldmfd sp!, {r4-r7} - bx lr - -zero_output - str r2, [r1, #vp8_blockd_eob] - vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0 - vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0 - - ldmfd sp!, {r4-r7} - bx lr - - ENDP - -; default inverse zigzag table is defined in vp9/common/vp9_entropy.c -_inv_zig_zag_ - DCD inv_zig_zag - - ALIGN 16 ; enable use of @128 bit aligned loads -inv_zig_zag - DCW 0x0001, 0x0002, 0x0006, 0x0007 - DCW 0x0003, 0x0005, 0x0008, 0x000d - DCW 0x0004, 0x0009, 0x000c, 0x000e - DCW 0x000a, 0x000b, 0x000f, 0x0010 - - END - diff --git a/vp9/encoder/arm/neon/vp9_memcpy_neon.asm b/vp9/encoder/arm/neon/vp9_memcpy_neon.asm deleted file mode 100644 index b0450e523..000000000 --- a/vp9/encoder/arm/neon/vp9_memcpy_neon.asm +++ /dev/null @@ -1,68 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_memcpy_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;========================================= -;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); -|vp8_memcpy_neon| PROC - ;pld [r1] ;preload pred data - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - mov r12, r2, lsr #8 ;copy 256 bytes data at one time - -memcpy_neon_loop - vld1.8 {q0, q1}, [r1]! ;load src data - subs r12, r12, #1 - vld1.8 {q2, q3}, [r1]! - vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr - vld1.8 {q4, q5}, [r1]! - vst1.8 {q2, q3}, [r0]! - vld1.8 {q6, q7}, [r1]! - vst1.8 {q4, q5}, [r0]! - vld1.8 {q8, q9}, [r1]! - vst1.8 {q6, q7}, [r0]! - vld1.8 {q10, q11}, [r1]! - vst1.8 {q8, q9}, [r0]! - vld1.8 {q12, q13}, [r1]! - vst1.8 {q10, q11}, [r0]! - vld1.8 {q14, q15}, [r1]! - vst1.8 {q12, q13}, [r0]! - vst1.8 {q14, q15}, [r0]! - - ;pld [r1] ;preload pred data -- need to adjust for real device - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - bne memcpy_neon_loop - - ands r3, r2, #0xff ;extra copy - beq done_copy_neon_loop - -extra_copy_neon_loop - vld1.8 {q0}, [r1]! ;load src data - subs r3, r3, #16 - vst1.8 {q0}, [r0]! - bne extra_copy_neon_loop - -done_copy_neon_loop - bx lr - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm b/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm deleted file mode 100644 index 4d1512d40..000000000 --- a/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm +++ /dev/null @@ -1,116 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_neon| - EXPORT |vp8_get4x4sse_cs_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;============================ -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -;note: in this function, sum is never used. So, we can remove this part of calculation -;from vp9_variance(). - -|vp8_mse16x16_neon| PROC - vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse - vmov.i8 q8, #0 - vmov.i8 q9, #0 - vmov.i8 q10, #0 - - mov r12, #8 - -mse16x16_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vmlal.s16 q7, d22, d22 - vmlal.s16 q8, d23, d23 - - subs r12, r12, #1 - - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vmlal.s16 q7, d26, d26 - vmlal.s16 q8, d27, d27 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne mse16x16_neon_loop - - vadd.u32 q7, q7, q8 - vadd.u32 q9, q9, q10 - - ldr r12, [sp] ;load *sse from stack - - vadd.u32 q10, q7, q9 - vpaddl.u32 q1, q10 - vadd.u64 d0, d2, d3 - - vst1.32 {d0[0]}, [r12] - vmov.32 r0, d0[0] - - bx lr - - ENDP - - -;============================= -; r0 unsigned char *src_ptr, -; r1 int source_stride, -; r2 unsigned char *ref_ptr, -; r3 int recon_stride -|vp8_get4x4sse_cs_neon| PROC - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d1}, [r0], r1 - vld1.8 {d5}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - vld1.8 {d3}, [r0], r1 - vld1.8 {d7}, [r2], r3 - - vsubl.u8 q11, d0, d4 - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vmull.s16 q7, d22, d22 - vmull.s16 q8, d24, d24 - vmull.s16 q9, d26, d26 - vmull.s16 q10, d28, d28 - - vadd.u32 q7, q7, q8 - vadd.u32 q9, q9, q10 - vadd.u32 q9, q7, q9 - - vpaddl.u32 q1, q9 - vadd.u64 d0, d2, d3 - - vmov.32 r0, d0[0] - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_picklpf_arm.c b/vp9/encoder/arm/neon/vp9_picklpf_arm.c deleted file mode 100644 index b427e5ef7..000000000 --- a/vp9/encoder/arm/neon/vp9_picklpf_arm.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_scale/vpxscale.h" -#include "vp9/common/vp9_alloccommon.h" - -extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); - - -void -vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { - unsigned char *src_y, *dst_y; - int yheight; - int ystride; - int border; - int yoffset; - int linestocopy; - - border = src_ybc->border; - yheight = src_ybc->y_height; - ystride = src_ybc->y_stride; - - linestocopy = (yheight >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - yoffset = ystride * ((yheight >> 5) * 16 - 8); - src_y = src_ybc->y_buffer + yoffset; - dst_y = dst_ybc->y_buffer + yoffset; - - // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16)); - vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16))); -} diff --git a/vp9/encoder/arm/neon/vp9_sad16_neon.asm b/vp9/encoder/arm/neon/vp9_sad16_neon.asm deleted file mode 100644 index d7c590e15..000000000 --- a/vp9/encoder/arm/neon/vp9_sad16_neon.asm +++ /dev/null @@ -1,207 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad16x16_neon| - EXPORT |vp8_sad16x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int src_stride -; r2 unsigned char *ref_ptr -; r3 int ref_stride -|vp8_sad16x16_neon| PROC -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0] - vld1.8 {q7}, [r2] - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================== -;unsigned int vp8_sad16x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -|vp8_sad16x8_neon| PROC - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_sad8_neon.asm b/vp9/encoder/arm/neon/vp9_sad8_neon.asm deleted file mode 100644 index 23ba6df93..000000000 --- a/vp9/encoder/arm/neon/vp9_sad8_neon.asm +++ /dev/null @@ -1,209 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad8x8_neon| - EXPORT |vp8_sad8x16_neon| - EXPORT |vp8_sad4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; unsigned int vp8_sad8x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x8_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================ -;unsigned int vp8_sad8x16_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x16_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;=========================== -;unsigned int vp8_sad4x4_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad4x4_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 d1, d24 - vpaddl.u32 d0, d1 - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_shortfdct_neon.asm b/vp9/encoder/arm/neon/vp9_shortfdct_neon.asm deleted file mode 100644 index 09dd011ec..000000000 --- a/vp9/encoder/arm/neon/vp9_shortfdct_neon.asm +++ /dev/null @@ -1,221 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_fdct4x4_neon| - EXPORT |vp8_short_fdct8x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=4 - - - ALIGN 16 ; enable use of @128 bit aligned loads -coeff - DCW 5352, 5352, 5352, 5352 - DCW 2217, 2217, 2217, 2217 - DCD 14500, 14500, 14500, 14500 - DCD 7500, 7500, 7500, 7500 - DCD 12000, 12000, 12000, 12000 - DCD 51000, 51000, 51000, 51000 - -;void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_neon| PROC - - ; Part one - vld1.16 {d0}, [r0@64], r2 - adr r12, coeff - vld1.16 {d1}, [r0@64], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {d2}, [r0@64], r2 - vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 - vld1.16 {d3}, [r0@64], r2 - - ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3] - vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2] - vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2] - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3] - - vshl.s16 q2, q2, #3 ; (a1, b1) << 3 - vshl.s16 q3, q3, #3 ; (c1, d1) << 3 - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 - vsub.s16 d2, d4, d5 ; op[2] = a1 - b1 - - vmlal.s16 q9, d7, d16 ; d1*5352 + 14500 - vmlal.s16 q10, d7, d17 ; d1*2217 + 7500 - vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500 - - vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - - ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vmov.s16 d26, #7 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12] - vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8] - vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8] - vadd.s16 d4, d4, d26 ; a1 + 7 - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12] - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7 - vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7 - - vmlal.s16 q11, d7, d16 ; d1*5352 + 12000 - vmlal.s16 q12, d7, d17 ; d1*2217 + 51000 - - vceq.s16 d4, d7, #0 - - vshr.s16 d0, d0, #4 - vshr.s16 d2, d2, #4 - - vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000 - - vmvn.s16 d4, d4 - vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16 - vsub.s16 d1, d1, d4 ; op[4] += (d1!=0) - vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - -;void vp8_short_fdct8x4_c(short *input, short *output, int pitch) -|vp8_short_fdct8x4_neon| PROC - - ; Part one - - vld1.16 {q0}, [r0@128], r2 - adr r12, coeff - vld1.16 {q1}, [r0@128], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {q2}, [r0@128], r2 - vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 - vld1.16 {q3}, [r0@128], r2 - - ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3] - vtrn.32 q0, q2 ; [A0|B0] - vtrn.32 q1, q3 ; [A1|B1] - vtrn.16 q0, q1 ; [A2|B2] - vtrn.16 q2, q3 ; [A3|B3] - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3] - vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2] - vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3] - - vshl.s16 q11, q11, #3 ; a1 << 3 - vshl.s16 q12, q12, #3 ; b1 << 3 - vshl.s16 q13, q13, #3 ; c1 << 3 - vshl.s16 q14, q14, #3 ; d1 << 3 - - vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1 - vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1 - - vmov.s16 q11, q9 ; 14500 - vmov.s16 q12, q10 ; 7500 - - vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500 - vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500 - vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500 - vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500 - - vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500 - vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500 - - vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12 - vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000 - - ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12] - vtrn.32 q0, q2 ; q0=[A0 | B0] - vtrn.32 q1, q3 ; q1=[A4 | B4] - vtrn.16 q0, q1 ; q2=[A8 | B8] - vtrn.16 q2, q3 ; q3=[A12|B12] - - vmov.s16 q15, #7 - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12] - vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8] - vadd.s16 q11, q11, q15 ; a1 + 7 - vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12] - - vadd.s16 q0, q11, q12 ; a1 + b1 + 7 - vsub.s16 q1, q11, q12 ; a1 - b1 + 7 - - vmov.s16 q11, q9 ; 12000 - vmov.s16 q12, q10 ; 51000 - - vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4 - vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4 - vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4 - vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4 - - - vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000 - vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000 - vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000 - vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000 - - vceq.s16 q14, q14, #0 - - vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000 - vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000 - - vmvn.s16 q14, q14 - - vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d1, d1, d28 ; A[4] += (d1!=0) - - vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d5, d5, d29 ; B[4] += (d1!=0) - - vst1.16 {q0, q1}, [r1@128]! ; block A - vst1.16 {q2, q3}, [r1@128]! ; block B - - bx lr - - ENDP - - END - diff --git a/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm b/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm deleted file mode 100644 index 22266297a..000000000 --- a/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm +++ /dev/null @@ -1,103 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_walsh4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch) -; r0 short *input, -; r1 short *output, -; r2 int pitch -|vp8_short_walsh4x4_neon| PROC - - vld1.16 {d0}, [r0@64], r2 ; load input - vld1.16 {d1}, [r0@64], r2 - vld1.16 {d2}, [r0@64], r2 - vld1.16 {d3}, [r0@64] - - ;First for-loop - ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - - vmov.s32 q15, #3 ; add 3 to all values - - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d2 ; ip[0] + ip[2] - vadd.s16 d5, d1, d3 ; ip[1] + ip[3] - vsub.s16 d6, d1, d3 ; ip[1] - ip[3] - vsub.s16 d7, d0, d2 ; ip[0] - ip[2] - - vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2 - vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2 - vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2 - vceq.s16 d16, d4, #0 ; a1 == 0 - vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2 - - vadd.s16 d0, d4, d5 ; a1 + d1 - vmvn d16, d16 ; a1 != 0 - vsub.s16 d3, d4, d5 ; op[3] = a1 - d1 - vadd.s16 d1, d7, d6 ; op[1] = b1 + c1 - vsub.s16 d2, d7, d6 ; op[2] = b1 - c1 - vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0) - - ;Second for-loop - ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d1, d3 - vtrn.32 d0, d2 - vtrn.16 d2, d3 - vtrn.16 d0, d1 - - vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8] - vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12] - vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12] - vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8] - - vadd.s32 q0, q8, q9 ; a2 = a1 + d1 - vadd.s32 q1, q11, q10 ; b2 = b1 + c1 - vsub.s32 q2, q11, q10 ; c2 = b1 - c1 - vsub.s32 q3, q8, q9 ; d2 = a1 - d1 - - vclt.s32 q8, q0, #0 - vclt.s32 q9, q1, #0 - vclt.s32 q10, q2, #0 - vclt.s32 q11, q3, #0 - - ; subtract -1 (or 0) - vsub.s32 q0, q0, q8 ; a2 += a2 < 0 - vsub.s32 q1, q1, q9 ; b2 += b2 < 0 - vsub.s32 q2, q2, q10 ; c2 += c2 < 0 - vsub.s32 q3, q3, q11 ; d2 += d2 < 0 - - vadd.s32 q8, q0, q15 ; a2 + 3 - vadd.s32 q9, q1, q15 ; b2 + 3 - vadd.s32 q10, q2, q15 ; c2 + 3 - vadd.s32 q11, q3, q15 ; d2 + 3 - - ; vrshrn? would add 1 << 3-1 = 2 - vshrn.s32 d0, q8, #3 - vshrn.s32 d1, q9, #3 - vshrn.s32 d2, q10, #3 - vshrn.s32 d3, q11, #3 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm b/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm deleted file mode 100644 index 8bb0734d1..000000000 --- a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm +++ /dev/null @@ -1,425 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_sub_pixel_variance16x16_neon_func| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pixels_per_line, -; stack(r6) unsigned int *sse -;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon. - -|vp9_sub_pixel_variance16x16_neon_func| PROC - push {r4-r6, lr} - - ldr r12, _BilinearTaps_coeff_ - ldr r4, [sp, #16] ;load *dst_ptr from stack - ldr r5, [sp, #20] ;load dst_pixels_per_line from stack - ldr r6, [sp, #24] ;load *sse from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -vp8e_filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! - vld1.u8 {d11, d12, d13}, [r0], r1 - - bne vp8e_filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - sub sp, sp, #256 - mov r3, sp - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -vp8e_filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - vst1.u8 {d4, d5}, [r3]! - vst1.u8 {d6, d7}, [r3]! - vmov q11, q15 - vst1.u8 {d8, d9}, [r3]! - - bne vp8e_filt_blk2d_sp16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - sub sp, sp, #528 ;reserve space on stack for temporary storage - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - mov r3, sp - -;First Pass: output_height lines x output_width columns (16x16) -vp8e_filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r3]! ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r3]! - vst1.u8 {d18, d19}, [r3]! - vst1.u8 {d20, d21}, [r3]! - - bne vp8e_filt_blk2d_fpo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - sub sp, sp, #528 ;reserve space on stack for temporary storage - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - mov r3, sp - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -vp8e_filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r3]! - vmov q11, q15 - vst1.u8 {d6, d7}, [r3]! - vst1.u8 {d8, d9}, [r3]! - - bne vp8e_filt_blk2d_spo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;---------------------------- -;variance16x16 -sub_pixel_variance16x16_neon - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - sub r3, r3, #256 - mov r12, #8 - -sub_pixel_variance16x16_neon_loop - vld1.8 {q0}, [r3]! ;Load up source and reference - vld1.8 {q2}, [r4], r5 - vld1.8 {q1}, [r3]! - vld1.8 {q3}, [r4], r5 - - vsubl.u8 q11, d0, d4 ;diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne sub_pixel_variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r6] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - add sp, sp, #528 - vmov.32 r0, d0[0] ;return - - pop {r4-r6,pc} - - ENDP - -;----------------- - -_BilinearTaps_coeff_ - DCD bilinear_taps_coeff -bilinear_taps_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm b/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm deleted file mode 100644 index a3faf9a77..000000000 --- a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm +++ /dev/null @@ -1,572 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_h_neon| - EXPORT |vp9_variance_halfpixvar16x16_v_neon| - EXPORT |vp9_variance_halfpixvar16x16_hv_neon| - EXPORT |vp9_sub_pixel_variance16x16s_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;================================================ -;unsigned int vp9_variance_halfpixvar16x16_h_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp9_variance_halfpixvar16x16_h_neon| PROC - push {lr} - - mov r12, #4 ;loop counter - ldr lr, [sp, #4] ;load *sse from stack - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -;First Pass: output_height lines x output_width columns (16x16) -vp8_filt_fpo16x16s_4_0_loop_neon - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - vld1.8 {q11}, [r2], r3 - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.8 {q12}, [r2], r3 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.8 {q13}, [r2], r3 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - vext.8 q3, q2, q3, #1 - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vld1.8 {q14}, [r2], r3 - vrhadd.u8 q1, q2, q3 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - - vsubl.u8 q4, d0, d22 ;diff - vsubl.u8 q5, d1, d23 - vsubl.u8 q6, d2, d24 - vsubl.u8 q7, d3, d25 - vsubl.u8 q0, d4, d26 - vsubl.u8 q1, d5, d27 - vsubl.u8 q2, d6, d28 - vsubl.u8 q3, d7, d29 - - vpadal.s16 q8, q4 ;sum - vmlal.s16 q9, d8, d8 ;sse - vmlal.s16 q10, d9, d9 - - subs r12, r12, #1 - - vpadal.s16 q8, q5 - vmlal.s16 q9, d10, d10 - vmlal.s16 q10, d11, d11 - vpadal.s16 q8, q6 - vmlal.s16 q9, d12, d12 - vmlal.s16 q10, d13, d13 - vpadal.s16 q8, q7 - vmlal.s16 q9, d14, d14 - vmlal.s16 q10, d15, d15 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_fpo16x16s_4_0_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;================================================ -;unsigned int vp9_variance_halfpixvar16x16_v_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp9_variance_halfpixvar16x16_v_neon| PROC - push {lr} - - mov r12, #4 ;loop counter - - vld1.u8 {q0}, [r0], r1 ;load src data - ldr lr, [sp, #4] ;load *sse from stack - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -vp8_filt_spo16x16s_0_4_loop_neon - vld1.u8 {q2}, [r0], r1 - vld1.8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vld1.8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vld1.8 {q5}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - - vrhadd.u8 q0, q0, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q4 - vrhadd.u8 q4, q4, q6 - vrhadd.u8 q6, q6, q15 - - vsubl.u8 q11, d0, d2 ;diff - vsubl.u8 q12, d1, d3 - vsubl.u8 q13, d4, d6 - vsubl.u8 q14, d5, d7 - vsubl.u8 q0, d8, d10 - vsubl.u8 q1, d9, d11 - vsubl.u8 q2, d12, d14 - vsubl.u8 q3, d13, d15 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - - vmov q0, q15 - - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_spo16x16s_0_4_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;================================================ -;unsigned int vp9_variance_halfpixvar16x16_hv_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp9_variance_halfpixvar16x16_hv_neon| PROC - push {lr} - - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - - ldr lr, [sp, #4] ;load *sse from stack - vmov.i8 q13, #0 ;q8 - sum - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - - vmov.i8 q14, #0 ;q9, q10 - sse - vmov.i8 q15, #0 - - mov r12, #4 ;loop counter - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - -;First Pass: output_height lines x output_width columns (17x16) -vp8_filt16x16s_4_4_loop_neon - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - vld1.u8 {d16, d17, d18, d19}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q3, q2, q3, #1 ;construct src_ptr[1] - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - vext.8 q9, q8, q9, #1 - - vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - vrhadd.u8 q4, q8, q9 - - vld1.8 {q5}, [r2], r3 - vrhadd.u8 q0, q0, q1 - vld1.8 {q6}, [r2], r3 - vrhadd.u8 q1, q1, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q3 - vld1.8 {q8}, [r2], r3 - vrhadd.u8 q3, q3, q4 - - vsubl.u8 q9, d0, d10 ;diff - vsubl.u8 q10, d1, d11 - vsubl.u8 q11, d2, d12 - vsubl.u8 q12, d3, d13 - - vsubl.u8 q0, d4, d14 ;diff - vsubl.u8 q1, d5, d15 - vsubl.u8 q5, d6, d16 - vsubl.u8 q6, d7, d17 - - vpadal.s16 q13, q9 ;sum - vmlal.s16 q14, d18, d18 ;sse - vmlal.s16 q15, d19, d19 - - vpadal.s16 q13, q10 ;sum - vmlal.s16 q14, d20, d20 ;sse - vmlal.s16 q15, d21, d21 - - vpadal.s16 q13, q11 ;sum - vmlal.s16 q14, d22, d22 ;sse - vmlal.s16 q15, d23, d23 - - vpadal.s16 q13, q12 ;sum - vmlal.s16 q14, d24, d24 ;sse - vmlal.s16 q15, d25, d25 - - subs r12, r12, #1 - - vpadal.s16 q13, q0 ;sum - vmlal.s16 q14, d0, d0 ;sse - vmlal.s16 q15, d1, d1 - - vpadal.s16 q13, q1 ;sum - vmlal.s16 q14, d2, d2 ;sse - vmlal.s16 q15, d3, d3 - - vpadal.s16 q13, q5 ;sum - vmlal.s16 q14, d10, d10 ;sse - vmlal.s16 q15, d11, d11 - - vmov q0, q4 - - vpadal.s16 q13, q6 ;sum - vmlal.s16 q14, d12, d12 ;sse - vmlal.s16 q15, d13, d13 - - bne vp8_filt16x16s_4_4_loop_neon - - vadd.u32 q15, q14, q15 ;accumulate sse - vpaddl.s32 q0, q13 ;accumulate sum - - vpaddl.u32 q1, q15 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;============================== -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack unsigned char *dst_ptr, -; stack int dst_pixels_per_line, -; stack unsigned int *sse -;note: in vp8_find_best_half_pixel_step()(called when 8common.rtcd.flags; - -#if HAVE_ARMV5TE - if (flags & HAS_EDSP) { - } -#endif - -#if HAVE_ARMV6 - if (flags & HAS_MEDIA) { - cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6; - /*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c; - cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c; - cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c; - cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/ - - /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/ - cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6; - /*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c; - cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/ - cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6; - - /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/ - cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6; - /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c; - cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/ - cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6; - cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6; - cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6; - cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6; - - cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6; - /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/ - - cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6; - cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6; - cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6; - cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6; - cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6; - - /*cpi->rtcd.encodemb.berr = vp9_block_error_c; - cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c; - cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/ - cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6; - cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6; - cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6; - - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/ - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6; - } -#endif - -#if HAVE_ARMV7 - if (flags & HAS_NEON) { - cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon; - cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon; - cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon; - cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon; - cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon; - - /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/ - cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon; - cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon; - cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon; - cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon; - - /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/ - cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon; - /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c; - cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/ - cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon; - cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon; - cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon; - cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon; - - cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon; - /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/ - - cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon; - cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon; - cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon; - cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon; - cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon; - - /*cpi->rtcd.encodemb.berr = vp9_block_error_c; - cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c; - cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/ - cpi->rtcd.encodemb.subb = vp9_subtract_b_neon; - cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon; - cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon; - - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; - cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/ - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon; - cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon; - } -#endif - -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (flags & HAS_NEON) -#endif - { - vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon; - } -#endif -#endif -} diff --git a/vp9/encoder/arm/vp9_boolhuff_arm.c b/vp9/encoder/arm/vp9_boolhuff_arm.c deleted file mode 100644 index 9ff8e5f56..000000000 --- a/vp9/encoder/arm/vp9_boolhuff_arm.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/encoder/vp9_boolhuff.h" -#include "vp9/common/vp9_blockd.h" - -const unsigned int vp9_prob_cost[256] = { - 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, - 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778, - 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, - 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, - 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433, - 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, - 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307, - 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, - 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, - 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174, - 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139, - 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, - 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, - 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50, - 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, - 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 -}; - diff --git a/vp9/encoder/arm/vp9_dct_arm.c b/vp9/encoder/arm/vp9_dct_arm.c deleted file mode 100644 index 5e20a4723..000000000 --- a/vp9/encoder/arm/vp9_dct_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "./vp9_rtcd.h" - -#if HAVE_ARMV6 - -void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) { - vp9_short_fdct4x4_armv6(input, output, pitch); - vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch); -} - -#endif /* HAVE_ARMV6 */ diff --git a/vp9/encoder/arm/vp9_dct_arm.h b/vp9/encoder/arm/vp9_dct_arm.h deleted file mode 100644 index 8eed31e60..000000000 --- a/vp9/encoder/arm/vp9_dct_arm.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_DCT_ARM_H_ -#define VP9_ENCODER_ARM_VP9_DCT_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_fdct(vp9_short_walsh4x4_armv6); -extern prototype_fdct(vp9_short_fdct4x4_armv6); -extern prototype_fdct(vp9_short_fdct8x4_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_fdct_walsh_short4x4 -#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6 - -#undef vp8_fdct_short4x4 -#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6 - -#undef vp8_fdct_short8x4 -#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6 - -#undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6 - -#undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6 -#endif - -#endif /* HAVE_ARMV6 */ - -#if HAVE_ARMV7 -extern prototype_fdct(vp9_short_fdct4x4_neon); -extern prototype_fdct(vp9_short_fdct8x4_neon); -extern prototype_fdct(vp8_fast_fdct4x4_neon); -extern prototype_fdct(vp8_fast_fdct8x4_neon); -extern prototype_fdct(vp9_short_walsh4x4_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_fdct_short4x4 -#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon - -#undef vp8_fdct_short8x4 -#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon - -#undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon - -#undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon - -#undef vp8_fdct_walsh_short4x4 -#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon -#endif - -#endif - -#endif diff --git a/vp9/encoder/arm/vp9_encodemb_arm.h b/vp9/encoder/arm/vp9_encodemb_arm.h deleted file mode 100644 index 2f21d2cba..000000000 --- a/vp9/encoder/arm/vp9_encodemb_arm.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_ -#define VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_subb(vp9_subtract_b_armv6); -extern prototype_submby(vp9_subtract_mby_armv6); -extern prototype_submbuv(vp9_subtract_mbuv_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_encodemb_subb -#define vp8_encodemb_subb vp9_subtract_b_armv6 - -#undef vp8_encodemb_submby -#define vp8_encodemb_submby vp9_subtract_mby_armv6 - -#undef vp8_encodemb_submbuv -#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6 -#endif - -#endif /* HAVE_ARMV6 */ - -#if HAVE_ARMV7 -// extern prototype_berr(vp9_block_error_c); -// extern prototype_mberr(vp9_mbblock_error_c); -// extern prototype_mbuverr(vp9_mbuverror_c); - -extern prototype_subb(vp9_subtract_b_neon); -extern prototype_submby(vp9_subtract_mby_neon); -extern prototype_submbuv(vp9_subtract_mbuv_neon); - -// #undef vp8_encodemb_berr -// #define vp8_encodemb_berr vp9_block_error_c - -// #undef vp8_encodemb_mberr -// #define vp8_encodemb_mberr vp9_mbblock_error_c - -// #undef vp8_encodemb_mbuverr -// #define vp8_encodemb_mbuverr vp9_mbuverror_c - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_encodemb_subb -#define vp8_encodemb_subb vp9_subtract_b_neon - -#undef vp8_encodemb_submby -#define vp8_encodemb_submby vp9_subtract_mby_neon - -#undef vp8_encodemb_submbuv -#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon -#endif - -#endif - -#endif diff --git a/vp9/encoder/arm/vp9_quantize_arm.c b/vp9/encoder/arm/vp9_quantize_arm.c deleted file mode 100644 index aacaa529c..000000000 --- a/vp9/encoder/arm/vp9_quantize_arm.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include -#include "vpx_mem/vpx_mem.h" - -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/common/vp9_entropy.h" - - -#if HAVE_ARMV7 - -/* vp8_quantize_mbX functions here differs from corresponding ones in - * vp9_quantize.c only by using quantize_b_pair function pointer instead of - * the regular quantize_b function pointer */ -void vp8_quantize_mby_neon(MACROBLOCK *x) { - int i; - int has_2nd_order = get_2nd_order_usage(xd); - - for (i = 0; i < 16; i += 2) - x->quantize_b_pair(&x->block[i], &x->block[i + 1], - &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); - - if (has_2nd_order) - x->quantize_b(&x->block[24], &x->e_mbd.block[24]); -} - -void vp8_quantize_mb_neon(MACROBLOCK *x) { - int i; - int has_2nd_order = get_2nd_order_usage(xd); - - for (i = 0; i < 24; i += 2) - x->quantize_b_pair(&x->block[i], &x->block[i + 1], - &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); - - if (has_2nd_order) - x->quantize_b(&x->block[i], &x->e_mbd.block[i]); -} - - -void vp8_quantize_mbuv_neon(MACROBLOCK *x) { - int i; - - for (i = 16; i < 24; i += 2) - x->quantize_b_pair(&x->block[i], &x->block[i + 1], - &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); -} - -#endif /* HAVE_ARMV7 */ diff --git a/vp9/encoder/arm/vp9_quantize_arm.h b/vp9/encoder/arm/vp9_quantize_arm.h deleted file mode 100644 index 41a83d7f9..000000000 --- a/vp9/encoder/arm/vp9_quantize_arm.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_QUANTIZE_ARM_H_ -#define VP9_ENCODER_ARM_VP9_QUANTIZE_ARM_H_ - -#if HAVE_ARMV6 - -extern prototype_quantize_block(vp8_fast_quantize_b_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_quantize_fastquantb -#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6 -#endif - -#endif /* HAVE_ARMV6 */ - - -#if HAVE_ARMV7 - -extern prototype_quantize_block(vp8_fast_quantize_b_neon); -extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_quantize_fastquantb -#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon - -#undef vp8_quantize_fastquantb_pair -#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon - -#undef vp8_quantize_mb -#define vp8_quantize_mb vp8_quantize_mb_neon - -#undef vp8_quantize_mbuv -#define vp8_quantize_mbuv vp8_quantize_mbuv_neon - -#undef vp8_quantize_mby -#define vp8_quantize_mby vp8_quantize_mby_neon -#endif - -#endif /* HAVE_ARMV7 */ - -#endif - diff --git a/vp9/encoder/arm/vp9_variance_arm.c b/vp9/encoder/arm/vp9_variance_arm.c deleted file mode 100644 index 91c0236e3..000000000 --- a/vp9/encoder/arm/vp9_variance_arm.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/arm/vp9_bilinearfilter_arm.h" - -#define HALFNDX 8 - -#if HAVE_ARMV6 - -unsigned int vp9_sub_pixel_variance8x8_armv6 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - unsigned short first_pass[10 * 8]; - unsigned char second_pass[8 * 8]; - const short *HFilter, *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, - src_pixels_per_line, - 9, 8, HFilter); - vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, - 8, 8, 8, VFilter); - - return vp9_variance8x8_armv6(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance16x16_armv6 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - unsigned short first_pass[36 * 16]; - unsigned char second_pass[20 * 16]; - const short *HFilter, *VFilter; - unsigned int var; - - if (xoffset == HALFNDX && yoffset == 0) { - var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 0 && yoffset == HALFNDX) { - var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } else { - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, - src_pixels_per_line, - 17, 16, HFilter); - vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, - 16, 16, 16, VFilter); - - var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); - } - return var; -} - -#endif /* HAVE_ARMV6 */ - - -#if HAVE_ARMV7 - -unsigned int vp9_sub_pixel_variance16x16_neon -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - if (xoffset == HALFNDX && yoffset == 0) - return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == 0 && yoffset == HALFNDX) - return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == HALFNDX && yoffset == HALFNDX) - return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else - return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); -} - -#endif diff --git a/vp9/encoder/arm/vp9_variance_arm.h b/vp9/encoder/arm/vp9_variance_arm.h deleted file mode 100644 index 144feea3d..000000000 --- a/vp9/encoder/arm/vp9_variance_arm.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_VARIANCE_ARM_H_ -#define VP9_ENCODER_ARM_VP9_VARIANCE_ARM_H_ - -#if HAVE_ARMV6 - -extern prototype_sad(vp9_sad16x16_armv6); -extern prototype_variance(vp9_variance16x16_armv6); -extern prototype_variance(vp9_variance8x8_armv6); -extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6); -extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6); -extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6); -extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6); -extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6); -extern prototype_variance(vp9_mse16x16_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_variance_sad16x16 -#define vp9_variance_sad16x16 vp9_sad16x16_armv6 - -#undef vp9_variance_subpixvar16x16 -#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6 - -#undef vp9_variance_subpixvar8x8 -#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6 - -#undef vp9_variance_var16x16 -#define vp9_variance_var16x16 vp9_variance16x16_armv6 - -#undef vp9_variance_mse16x16 -#define vp9_variance_mse16x16 vp9_mse16x16_armv6 - -#undef vp9_variance_var8x8 -#define vp9_variance_var8x8 vp9_variance8x8_armv6 - -#undef vp9_variance_halfpixvar16x16_h -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6 - -#undef vp9_variance_halfpixvar16x16_v -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6 - -#undef vp9_variance_halfpixvar16x16_hv -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6 - -#endif /* !CONFIG_RUNTIME_CPU_DETECT */ - -#endif /* HAVE_ARMV6 */ - - -#if HAVE_ARMV7 -extern prototype_sad(vp9_sad4x4_neon); -extern prototype_sad(vp9_sad8x8_neon); -extern prototype_sad(vp9_sad8x16_neon); -extern prototype_sad(vp9_sad16x8_neon); -extern prototype_sad(vp9_sad16x16_neon); - -extern prototype_variance(vp9_variance8x8_neon); -extern prototype_variance(vp9_variance8x16_neon); -extern prototype_variance(vp9_variance16x8_neon); -extern prototype_variance(vp9_variance16x16_neon); - -extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon); -extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon); -extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func); -extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon); -extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon); -extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon); - -extern prototype_variance(vp9_mse16x16_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_variance_sad4x4 -#define vp9_variance_sad4x4 vp9_sad4x4_neon - -#undef vp9_variance_sad8x8 -#define vp9_variance_sad8x8 vp9_sad8x8_neon - -#undef vp9_variance_sad8x16 -#define vp9_variance_sad8x16 vp9_sad8x16_neon - -#undef vp9_variance_sad16x8 -#define vp9_variance_sad16x8 vp9_sad16x8_neon - -#undef vp9_variance_sad16x16 -#define vp9_variance_sad16x16 vp9_sad16x16_neon - -#undef vp9_variance_var8x8 -#define vp9_variance_var8x8 vp9_variance8x8_neon - -#undef vp9_variance_var8x16 -#define vp9_variance_var8x16 vp9_variance8x16_neon - -#undef vp9_variance_var16x8 -#define vp9_variance_var16x8 vp9_variance16x8_neon - -#undef vp9_variance_var16x16 -#define vp9_variance_var16x16 vp9_variance16x16_neon - -#undef vp9_variance_subpixvar8x8 -#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon - -#undef vp9_variance_subpixvar16x16 -#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon - -#undef vp9_variance_halfpixvar16x16_h -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon - -#undef vp9_variance_halfpixvar16x16_v -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon - -#undef vp9_variance_halfpixvar16x16_hv -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon - -#undef vp9_variance_mse16x16 -#define vp9_variance_mse16x16 vp9_mse16x16_neon - -#endif - -#endif - -#endif diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c index 3fe9c8fb7..30431ff8c 100644 --- a/vp9/encoder/vp9_asm_enc_offsets.c +++ b/vp9/encoder/vp9_asm_enc_offsets.c @@ -79,12 +79,4 @@ END /* add asserts for any offset that is not supported by assembly code * add asserts for any size that is not supported by assembly code - - * These are used in vp8cx_pack_tokens. They are hard coded so if their sizes - * change they will have to be adjusted. */ - -#if HAVE_ARMV5TE -ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8) -ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16) -#endif diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index f94e00c1e..4270a1d35 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -40,24 +40,12 @@ #include "vp9/common/vp9_mvref_common.h" #include "vp9/encoder/vp9_temporal_filter.h" -#if ARCH_ARM -#include "vpx_ports/arm.h" -#endif - #include #include #include extern void print_tree_update_probs(); -#if HAVE_ARMV7 -extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc); - -extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc); -#endif - static void set_default_lf_deltas(VP9_COMP *cpi); #define DEFAULT_INTERP_FILTER EIGHTTAP /* SWITCHABLE for better performance */ @@ -4055,33 +4043,15 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, } } -// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us. -#if HAVE_ARMV7 -extern void vp9_push_neon(int64_t *store); -extern void vp9_pop_neon(int64_t *store); -#endif - int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { -#if HAVE_ARMV7 - int64_t store_reg[8]; -#endif VP9_COMP *cpi = (VP9_COMP *) ptr; VP9_COMMON *cm = &cpi->common; struct vpx_usec_timer timer; int res = 0; -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_push_neon(store_reg); - } -#endif - vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) @@ -4090,15 +4060,6 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(store_reg); - } -#endif - return res; } @@ -4119,9 +4080,6 @@ static int frame_is_reference(const VP9_COMP *cpi) { int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, int64_t *time_stamp, int64_t *time_end, int flush) { -#if HAVE_ARMV7 - int64_t store_reg[8]; -#endif VP9_COMP *cpi = (VP9_COMP *) ptr; VP9_COMMON *cm = &cpi->common; struct vpx_usec_timer cmptimer; @@ -4130,15 +4088,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, if (!cpi) return -1; -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_push_neon(store_reg); - } -#endif - vpx_usec_timer_start(&cmptimer); cpi->source = NULL; @@ -4191,14 +4140,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->twopass.first_pass_done = 1; } -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(store_reg); - } -#endif return -1; } @@ -4425,15 +4366,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #endif -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(store_reg); - } -#endif - return 0; } diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 824951afa..f10fb3a1d 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -17,13 +17,6 @@ #include "vpx_scale/vpxscale.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" -#if ARCH_ARM -#include "vpx_ports/arm.h" -#endif - -#if HAVE_ARMV7 -extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); -#endif void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { @@ -254,22 +247,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int Bias = 0; // Bias against raising loop filter and in favour of lowering it // Make a copy of the unfiltered / processed recon buffer -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); - } -#endif + vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); if (cm->frame_type == KEY_FRAME) cm->sharpness_level = 0; @@ -295,22 +273,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { filt_best = filt_mid; // Re-instate the unfiltered frame -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - } -#endif + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); while (filter_step > 0) { Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images @@ -334,22 +297,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - } -#endif + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // If value is close to the best so far then bias towards a lower loop filter value. if ((filt_err - Bias) < best_err) { @@ -369,22 +317,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - } -#endif + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // Was it better than the previous best? if (filt_err < (best_err - Bias)) { @@ -405,4 +338,3 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { cm->filter_level = filt_best; } - diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index d801ca74b..dd11e75ba 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -26,10 +26,6 @@ #include "x86/vp9_quantize_x86.h" #endif -#if ARCH_ARM -#include "arm/vp9_quantize_arm.h" -#endif - #define prototype_quantize_block_type(sym) \ void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type) extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4); -- cgit v1.2.3