author     John Koleszar <jkoleszar@google.com>  2010-05-18 11:58:33 -0400
committer  John Koleszar <jkoleszar@google.com>  2010-05-18 11:58:33 -0400
commit     0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree       1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder/arm/neon
Initial WebM release
Diffstat (limited to 'vp8/encoder/arm/neon')
18 files changed, 4364 insertions, 0 deletions
diff --git a/vp8/encoder/arm/neon/boolhuff_armv7.asm b/vp8/encoder/arm/neon/boolhuff_armv7.asm new file mode 100644 index 000000000..9a5f36661 --- /dev/null +++ b/vp8/encoder/arm/neon/boolhuff_armv7.asm @@ -0,0 +1,292 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_start_encode| + EXPORT |vp8_encode_bool| + EXPORT |vp8_stop_encode| + EXPORT |vp8_encode_value| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 BOOL_CODER *br +; r1 unsigned char *source + +|vp8_start_encode| PROC + mov r12, #0 + mov r3, #255 + mvn r2, #23 + str r12, [r0, #vp8_writer_lowvalue] + str r3, [r0, #vp8_writer_range] + str r12, [r0, #vp8_writer_value] + str r2, [r0, #vp8_writer_count] + str r12, [r0, #vp8_writer_pos] + str r1, [r0, #vp8_writer_buffer] + bx lr + ENDP + +; r0 BOOL_CODER *br +; r1 int bit +; r2 int probability +|vp8_encode_bool| PROC + push {r4-r9, lr} + + mov r4, r2 + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + sub r7, r5, #1 ; range-1 + + cmp r1, #0 + mul r4, r4, r7 ; ((range-1) * probability) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * probability) >> 8) + + addne r2, r2, r4 ; if (bit) lowvalue += split + subne r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r9, pc} + ENDP + +; r0 BOOL_CODER *br +|vp8_stop_encode| PROC + push {r4-r10, lr} + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + mov r10, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros 
is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne stop_encode_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r10, pc} + + ENDP + +; r0 BOOL_CODER *br +; r1 int data +; r2 int bits +|vp8_encode_value| PROC + push {r4-r11, lr} + + mov r10, r2 + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r11, r1 + rsb r4, r10, #32 ; 32-n + + ; v is kept in r1 during the token pack loop + lsr r1, r11, r4 ; v >>= 32 - n + +encode_value_loop + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsrs r1, r1, #1 ; bit = v >> n + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + addcs r2, r2, r4 ; if (bit) lowvalue += split + subcs r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_ev ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_ev + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_ev +token_zero_while_loop_ev + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_ev + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_ev + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_ev + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_ev + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne encode_value_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r11, pc} + ENDP + + END diff --git a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm new file mode 100644 index 000000000..d5dec440d --- /dev/null +++ b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm @@ -0,0 +1,126 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_fast_fdct4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); +;NOTE: +;The input *src_diff. src_diff is calculated as: +;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function) +;In which *src_ptr and *pred_ptr both are unsigned char. +;Therefore, *src_diff should be in the range of [-255, 255]. +;CAUTION: +;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255]. +;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes +;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c. + +|vp8_fast_fdct4x4_neon| PROC + vld1.16 {d2}, [r0], r2 ;load input + ldr r12, _ffdct_coeff_ + vld1.16 {d3}, [r0], r2 + vld1.16 {d4}, [r0], r2 + vld1.16 {d0}, [r12] + vld1.16 {d5}, [r0], r2 + + ;First for-loop + ;transpose d2, d3, d4, d5. 
Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;ip[0]+ip[3] + vadd.s16 d7, d3, d4 ;ip[1]+ip[2] + vsub.s16 d8, d3, d4 ;ip[1]-ip[2] + vsub.s16 d9, d2, d5 ;ip[0]-ip[3] + vshl.i16 q3, q3, #1 ; a1, b1 + vshl.i16 q4, q4, #1 ; c1, d1 + + vadd.s16 d10, d6, d7 ;temp1 = a1 + b1 + vsub.s16 d11, d6, d7 ;temp2 = a1 - b1 + + vqdmulh.s16 q6, q5, d0[1] + vqdmulh.s16 q8, q4, d0[0] + vqdmulh.s16 q7, q4, d0[2] + + vshr.s16 q6, q6, #1 + vshr.s16 q8, q8, #1 + vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16 + vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d10, d12 ;op[0] = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d4, d11, d13 ;op[2] = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d3, d14, d17 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d5, d15, d16 ;op[3] = temp1 - temp2 + + ;Second for-loop + ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12] + vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8] + vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8] + vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12] + + vadd.s16 d10, d6, d7 ;temp1 = a1 + b1 + vsub.s16 d11, d6, d7 ;temp2 = a1 - b1 + + + vqdmulh.s16 q6, q5, d0[1] + vqdmulh.s16 q8, q4, d0[0] + vqdmulh.s16 q7, q4, d0[2] + + vshr.s16 q6, q6, #1 + vshr.s16 q8, q8, #1 + vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16 + vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d10, d12 ;a2 = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d4, d11, d13 ;c2 = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d3, d14, d17 ;b2 = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d5, d15, d16 ;d2 = temp1 - temp2 + + vclt.s16 q3, q1, #0 + vclt.s16 q4, q2, #0 + + vsub.s16 q1, q1, q3 + vsub.s16 q2, q2, q4 + + vshr.s16 q1, q1, #1 + vshr.s16 q2, q2, #1 + + vst1.16 {q1, q2}, [r1] + + bx lr + + ENDP + +;----------------- + AREA fastfdct_dat, DATA, READONLY +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_ffdct_coeff_ + DCD ffdct_coeff +ffdct_coeff +; 60547 = 0xEC83 +; 46341 = 0xB505 +; 25080 = 0x61F8 + DCD 0xB505EC83, 0x000061F8 + + END diff --git a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm new file mode 100644 index 000000000..de1c25469 --- /dev/null +++ b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm @@ -0,0 +1,179 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_fast_fdct8x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); +;NOTE: +;The input *src_diff. src_diff is calculated as: +;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function) +;In which *src_ptr and *pred_ptr both are unsigned char. +;Therefore, *src_diff should be in the range of [-255, 255]. 
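
The lane comments in these two fast FDCT files spell the arithmetic out step by step. As a rough scalar model of one pass over a row of four inputs (hypothetical, and not claimed bit-exact: the NEON code realizes each >>16 product with vqdmulh plus a vshr #1, which saturates), with X_C1/X_C2/X_C3 taken from the 60547/46341/25080 words in the coefficient tables:

    /* Hypothetical scalar model of one fast-FDCT pass, reconstructed
     * from the lane comments above; a sketch, not a bit-exact reference. */
    #define X_C1 60547   /* 0xEC83 */
    #define X_C2 46341   /* 0xB505 */
    #define X_C3 25080   /* 0x61F8 */

    static void fdct4_pass_sketch(const short *ip, short *op)
    {
        int a1 = (ip[0] + ip[3]) << 1;      /* vadd.s16 + vshl.i16 */
        int b1 = (ip[1] + ip[2]) << 1;
        int c1 = (ip[1] - ip[2]) << 1;      /* vsub.s16 + vshl.i16 */
        int d1 = (ip[0] - ip[3]) << 1;

        int temp1 = a1 + b1;
        int temp2 = a1 - b1;

        op[0] = (short)(((temp1 * X_C2) >> 16) + temp1);  /* vqdmulh, vshr #1 */
        op[2] = (short)(((temp2 * X_C2) >> 16) + temp2);
        op[1] = (short)(((c1 * X_C3) >> 16) + ((d1 * X_C1) >> 16) + d1);
        op[3] = (short)(((d1 * X_C3) >> 16) - ((c1 * X_C1) >> 16) - c1);
    }
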
+;CAUTION: +;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255]. +;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes +;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c. + +|vp8_fast_fdct8x4_neon| PROC + vld1.16 {q1}, [r0], r2 ;load input + ldr r12, _ffdct8_coeff_ + vld1.16 {q2}, [r0], r2 + vld1.16 {q3}, [r0], r2 + vld1.16 {d0}, [r12] + vld1.16 {q4}, [r0], r2 + + ;First for-loop + ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3] + ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3] + vtrn.32 d2, d6 + vtrn.32 d3, d7 + vtrn.32 d4, d8 + vtrn.32 d5, d9 + vtrn.16 d2, d4 + vtrn.16 d3, d5 + vtrn.16 d6, d8 + vtrn.16 d7, d9 + + vadd.s16 d10, d2, d8 ;ip[0]+ip[3] + vadd.s16 d11, d4, d6 ;ip[1]+ip[2] + vsub.s16 d12, d4, d6 ;ip[1]-ip[2] + vsub.s16 d13, d2, d8 ;ip[0]-ip[3] + vadd.s16 d22, d3, d9 + vadd.s16 d23, d5, d7 + vsub.s16 d24, d5, d7 + vsub.s16 d25, d3, d9 + + vshl.i16 q5, q5, #1 ; a1, b1 + vshl.i16 q6, q6, #1 ; c1, d1 + vshl.i16 q1, q11, #1 + vshl.i16 q2, q12, #1 + + vadd.s16 d14, d10, d11 ;temp1 = a1 + b1 + vsub.s16 d15, d10, d11 ;temp2 = a1 - b1 + vadd.s16 d24, d2, d3 + vsub.s16 d25, d2, d3 + + vqdmulh.s16 q8, q7, d0[1] + vqdmulh.s16 q13, q12, d0[1] + vqdmulh.s16 q10, q6, d0[0] + vqdmulh.s16 q15, q2, d0[0] + vqdmulh.s16 q9, q6, d0[2] + vqdmulh.s16 q14, q2, d0[2] + + vshr.s16 q8, q8, #1 + vshr.s16 q13, q13, #1 + vshr.s16 q10, q10, #1 + vshr.s16 q15, q15, #1 + vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16 + vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16 + vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1 + vadd.s16 q15, q2, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d14, d16 ;op[0] = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d3, d24, d26 ;op[0] = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d6, d15, d17 ;op[2] = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d7, d25, d27 ;op[2] = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d4, d18, d21 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection + vadd.s16 d5, d28, d31 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d8, d19, d20 ;op[3] = temp1 - temp2 + vsub.s16 d9, d29, d30 ;op[3] = temp1 - temp2 + + ;Second for-loop + ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12] + ;transpose d3, d5, d7, d9. 
Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12] + vtrn.32 d2, d6 + vtrn.32 d3, d7 + vtrn.32 d4, d8 + vtrn.32 d5, d9 + vtrn.16 d2, d4 + vtrn.16 d3, d5 + vtrn.16 d6, d8 + vtrn.16 d7, d9 + + vadd.s16 d10, d2, d8 ;a1 = ip[0]+ip[12] + vadd.s16 d11, d4, d6 ;b1 = ip[4]+ip[8] + vsub.s16 d12, d4, d6 ;c1 = ip[4]-ip[8] + vsub.s16 d13, d2, d8 ;d1 = ip[0]-ip[12] + vadd.s16 d2, d3, d9 + vadd.s16 d4, d5, d7 + vsub.s16 d24, d5, d7 + vsub.s16 d25, d3, d9 + + vadd.s16 d14, d10, d11 ;temp1 = a1 + b1 + vsub.s16 d15, d10, d11 ;temp2 = a1 - b1 + vadd.s16 d22, d2, d4 + vsub.s16 d23, d2, d4 + + vqdmulh.s16 q8, q7, d0[1] + vqdmulh.s16 q13, q11, d0[1] + vqdmulh.s16 q10, q6, d0[0] + vqdmulh.s16 q15, q12, d0[0] + vqdmulh.s16 q9, q6, d0[2] + vqdmulh.s16 q14, q12, d0[2] + + vshr.s16 q8, q8, #1 + vshr.s16 q13, q13, #1 + vshr.s16 q10, q10, #1 + vshr.s16 q15, q15, #1 + vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16 + vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16 + vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1 + vadd.s16 q15, q12, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d14, d16 ;a2 = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d6, d22, d26 ;a2 = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d4, d15, d17 ;c2 = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d8, d23, d27 ;c2 = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d3, d18, d21 ;b2 = temp1 + temp2 -- q is not necessary, just for protection + vadd.s16 d7, d28, d31 ;b2 = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d5, d19, d20 ;d2 = temp1 - temp2 + vsub.s16 d9, d29, d30 ;d2 = temp1 - temp2 + + vclt.s16 q5, q1, #0 + vclt.s16 q6, q2, #0 + vclt.s16 q7, q3, #0 + vclt.s16 q8, q4, #0 + + vsub.s16 q1, q1, q5 + vsub.s16 q2, q2, q6 + vsub.s16 q3, q3, q7 + vsub.s16 q4, q4, q8 + + vshr.s16 q1, q1, #1 + vshr.s16 q2, q2, #1 + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vst1.16 {q1, q2}, [r1]! + vst1.16 {q3, q4}, [r1] + + bx lr + + ENDP + +;----------------- + AREA fastfdct8x4_dat, DATA, READONLY +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_ffdct8_coeff_ + DCD ffdct8_coeff +ffdct8_coeff +; 60547 = 0xEC83 +; 46341 = 0xB505 +; 25080 = 0x61F8 + DCD 0xB505EC83, 0x000061F8 + + END diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm new file mode 100644 index 000000000..11070377b --- /dev/null +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm @@ -0,0 +1,117 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
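
Both fast FDCT routines above finish with the same vclt/vsub/vshr triple before storing. It halves with rounding toward zero: vclt.s16 yields all-ones (-1) in negative lanes, so subtracting that mask adds 1 before the arithmetic shift. A one-lane C equivalent of the sequence:

    /* Round-toward-zero halving, the scalar form of the
     * vclt.s16 / vsub.s16 / vshr.s16 #1 sequence above. */
    static inline short halve_toward_zero(short v)
    {
        return (short)((v + (v < 0)) >> 1);   /* e.g. -3 -> -1, 3 -> 1 */
    }
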
+; + + + EXPORT |vp8_fast_quantize_b_neon_func| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 short *coeff_ptr +; r1 short *zbin_ptr +; r2 short *qcoeff_ptr +; r3 short *dqcoeff_ptr +; stack short *dequant_ptr +; stack short *scan_mask +; stack short *round_ptr +; stack short *quant_ptr + +; return int * eob +|vp8_fast_quantize_b_neon_func| PROC + vld1.16 {q0, q1}, [r0] ;load z + vld1.16 {q10, q11}, [r1] ;load zbin + + vabs.s16 q4, q0 ;calculate x = abs(z) + vabs.s16 q5, q1 + + vcge.s16 q10, q4, q10 ;x>=zbin + vcge.s16 q11, q5, q11 + + ;if x<zbin (q10 & q11 are all 0), go to zero_output + vorr.s16 q6, q10, q11 + vorr.s16 d12, d12, d13 + vmov r0, r1, d12 + orr r0, r0, r1 + cmp r0, #0 + beq zero_output + + ldr r0, [sp, #8] ;load round_ptr + ldr r12, [sp, #12] ;load quant_ptr + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vshr.s16 q3, q1, #15 + + vld1.s16 {q6, q7}, [r0] ;load round_ptr [0-15] + vld1.s16 {q8, q9}, [r12] ;load quant_ptr [0-15] + + vadd.s16 q4, q6 ;x + Round + vadd.s16 q5, q7 + + ldr r0, [sp, #4] ;load rvsplus1_scan_order ptr + + vqdmulh.s16 q4, q8 ;y = ((Round + abs(z)) * Quant) >> 16 + vqdmulh.s16 q5, q9 + + vld1.16 {q0, q1}, [r0] ;load rvsplus1_scan_order + vceq.s16 q8, q8 ;set q8 to all 1 + + vshr.s16 q4, #1 ;right shift 1 after vqdmulh + vshr.s16 q5, #1 + + ;modify data to have its original sign + veor.s16 q4, q2 ; y^sz + veor.s16 q5, q3 + + ldr r12, [sp] ;load dequant_ptr + + vsub.s16 q4, q2 ; x1 = (y^sz) - sz = (y^sz) - (-1) (two's complement) + vsub.s16 q5, q3 + + vand.s16 q4, q10 ;mask off x1 elements + vand.s16 q5, q11 + + vld1.s16 {q6, q7}, [r12] ;load dequant_ptr[i] + + vtst.16 q14, q4, q8 ;now find eob + vtst.16 q15, q5, q8 ;non-zero element is set to all 1 in q4, q5 + + vst1.s16 {q4, q5}, [r2] ;store: qcoeff = x1 + + vand q0, q0, q14 ;get all valid number from rvsplus1_scan_order array + vand q1, q1, q15 + + vmax.u16 q0, q0, q1 ;find maximum value in q0, q1 + vmax.u16 d0, d0, d1 + vmovl.u16 q0, d0 + + vmul.s16 q6, q4 ;x * Dequant + vmul.s16 q7, q5 + + vmax.u32 d0, d0, d1 + vpmax.u32 d0, d0, d0 + + vst1.s16 {q6, q7}, [r3] ;store dqcoeff = x * Dequant + + vmov.32 r0, d0[0] + bx lr + +zero_output + vst1.s16 {q10, q11}, [r2] ; qcoeff = 0 + vst1.s16 {q10, q11}, [r3] ; dqcoeff = 0 + mov r0, #0 + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/sad16_neon.asm b/vp8/encoder/arm/neon/sad16_neon.asm new file mode 100644 index 000000000..6169f10da --- /dev/null +++ b/vp8/encoder/arm/neon/sad16_neon.asm @@ -0,0 +1,206 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
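
Following the register map above, the fast quantizer folds the sign out with an arithmetic shift, quantizes the magnitude, and derives the end-of-block from the reversed-plus-one scan table: a nonzero lane's scan value is its position + 1, so a max-reduce over the masked lanes yields the eob directly. A hedged behavioural C sketch, with scan_value standing in for the rvsplus1_scan_order table:

    /* Behavioural model of vp8_fast_quantize_b_neon_func, per the
     * register comments above; names here are illustrative only. */
    static int fast_quantize_b_sketch(const short *z, const short *zbin,
                                      short *qcoeff, short *dqcoeff,
                                      const short *dequant,
                                      const short *scan_value,  /* rvsplus1 */
                                      const short *round, const short *quant)
    {
        int i, eob = 0;

        for (i = 0; i < 16; i++)
        {
            int sz = z[i] >> 15;                /* 0 or -1 (vshr.s16 #15) */
            int x  = (z[i] ^ sz) - sz;          /* abs(z)     (vabs.s16)  */
            int x1 = 0;

            if (x >= zbin[i])                   /* vcge.s16 mask          */
            {
                int y = ((x + round[i]) * quant[i]) >> 16;  /* vqdmulh+vshr */
                x1 = (y ^ sz) - sz;             /* restore the sign       */
                if (x1 && scan_value[i] > eob)
                    eob = scan_value[i];        /* vtst + vmax reduction  */
            }

            qcoeff[i]  = (short)x1;
            dqcoeff[i] = (short)(x1 * dequant[i]);
        }
        return eob;
    }
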
+; + + + EXPORT |vp8_sad16x16_neon| + EXPORT |vp8_sad16x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int src_stride +; r2 unsigned char *ref_ptr +; r3 int ref_stride +|vp8_sad16x16_neon| PROC +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0] + vld1.8 {q7}, [r2] + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================== +;unsigned int vp8_sad16x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +|vp8_sad16x8_neon| PROC + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/sad8_neon.asm b/vp8/encoder/arm/neon/sad8_neon.asm new file mode 100644 index 000000000..28604ddeb --- /dev/null +++ b/vp8/encoder/arm/neon/sad8_neon.asm @@ -0,0 +1,208 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
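
Every SAD kernel in this file and in sad8_neon.asm computes the same scalar quantity, differing only in block size: absolute source/reference differences are widened and accumulated with vabdl.u8/vabal.u8 and reduced at the end with pairwise adds. The scalar reference, parameterized by block size:

    /* Scalar reference for the SAD kernels (16x16 and 16x8 here;
     * 8x8, 8x16 and 4x4 in sad8_neon.asm). */
    static unsigned int sad_mxn_sketch(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int width, int height)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)          /* vabdl.u8 / vabal.u8 */
                sad += (unsigned int)(src[c] > ref[c] ? src[c] - ref[c]
                                                      : ref[c] - src[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;                              /* vpaddl/vadd reduction */
    }
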
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sad8x8_neon| + EXPORT |vp8_sad8x16_neon| + EXPORT |vp8_sad4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; unsigned int vp8_sad8x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x8_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================ +;unsigned int vp8_sad8x16_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x16_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;=========================== +;unsigned int vp8_sad4x4_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad4x4_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 d1, d24 + vpaddl.u32 d0, d1 + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm new file mode 100644 index 000000000..26bc0d06c --- /dev/null +++ b/vp8/encoder/arm/neon/shortfdct_neon.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_short_fdct4x4_neon| + EXPORT |vp8_short_fdct8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 short *input +; r1 short *output +; r2 int pitch +; Input has a pitch, output is contiguous +|vp8_short_fdct4x4_neon| PROC + ldr r12, _dct_matrix_ + vld1.16 d0, [r0], r2 + vld1.16 d1, [r0], r2 + vld1.16 d2, [r0], r2 + vld1.16 d3, [r0] + vld1.16 {q2, q3}, [r12] + +;first stage + vmull.s16 q11, d4, d0[0] ;i=0 + vmull.s16 q12, d4, d1[0] ;i=1 + vmull.s16 q13, d4, d2[0] ;i=2 + vmull.s16 q14, d4, d3[0] ;i=3 + + vmlal.s16 q11, d5, d0[1] + vmlal.s16 q12, d5, d1[1] + vmlal.s16 q13, d5, d2[1] + vmlal.s16 q14, d5, d3[1] + + vmlal.s16 q11, d6, d0[2] + vmlal.s16 q12, d6, d1[2] + vmlal.s16 q13, d6, d2[2] + vmlal.s16 q14, d6, d3[2] + + vmlal.s16 q11, d7, d0[3] ;sumtemp for i=0 + vmlal.s16 q12, d7, d1[3] ;sumtemp for i=1 + vmlal.s16 q13, d7, d2[3] ;sumtemp for i=2 + vmlal.s16 q14, d7, d3[3] ;sumtemp for i=3 + + ; rounding + vrshrn.i32 d22, q11, #14 + vrshrn.i32 d24, q12, #14 + vrshrn.i32 d26, q13, #14 + vrshrn.i32 d28, q14, #14 + +;second stage + vmull.s16 q4, d22, d4[0] ;i=0 + vmull.s16 q5, d22, d4[1] ;i=1 + vmull.s16 q6, d22, d4[2] ;i=2 + vmull.s16 q7, d22, d4[3] ;i=3 + + vmlal.s16 q4, d24, d5[0] + vmlal.s16 q5, d24, d5[1] + vmlal.s16 q6, d24, d5[2] + vmlal.s16 q7, d24, d5[3] + + vmlal.s16 q4, d26, d6[0] + vmlal.s16 q5, d26, d6[1] + vmlal.s16 q6, d26, d6[2] + vmlal.s16 q7, d26, d6[3] + + vmlal.s16 q4, d28, d7[0] ;sumtemp for i=0 + vmlal.s16 q5, d28, d7[1] ;sumtemp for i=1 + vmlal.s16 q6, d28, d7[2] ;sumtemp for i=2 + vmlal.s16 q7, d28, d7[3] ;sumtemp for i=3 + + vrshr.s32 q0, q4, #16 + vrshr.s32 q1, q5, #16 + vrshr.s32 q2, q6, #16 + vrshr.s32 q3, q7, #16 + + vmovn.i32 d0, q0 + vmovn.i32 d1, q1 + vmovn.i32 d2, q2 + vmovn.i32 d3, q3 + + vst1.16 {q0, q1}, [r1] + + bx lr + + ENDP + +; r0 short *input +; r1 short *output +; r2 int pitch +|vp8_short_fdct8x4_neon| PROC + ; Store link register and input before calling + ; first 4x4 fdct. Do not need to worry about + ; output or pitch because those pointers are not + ; touched in the 4x4 fdct function + stmdb sp!, {r0, lr} + + bl vp8_short_fdct4x4_neon + + ldmia sp!, {r0, lr} + + ; Move to the next block of data. + add r0, r0, #8 + add r1, r1, #32 + + ; Second time through do not store off the + ; link register, just return from the 4x4 fdtc + b vp8_short_fdct4x4_neon + + ; Should never get to this. + bx lr + + ENDP + +;----------------- + AREA dct4x4_dat, DATA, READONLY +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... 
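
vp8_short_fdct4x4_neon treats each 1-D 4-point DCT as a fixed-point matrix multiply against the _dct_matrix_ table defined just below (its rows are spelled out in the commented DCW lines): the first, row-wise stage rounds by 14 bits with vrshrn, and the second, column-wise stage rounds by 16 bits. A rough scalar model of one stage, with lane orientation inferred from the vmull/vmlal order (a sketch, not a bit-exact reference):

    /* Hypothetical scalar model of one 1-D stage of the short FDCT. */
    static const short dct_matrix_sketch[4][4] = {
        { 23170,  30274,  23170,  12540 },
        { 23170,  12540, -23170, -30274 },
        { 23170, -12540, -23170,  30274 },
        { 23170, -30274,  23170, -12540 },
    };

    static void fdct4_stage_sketch(const short in[4], short out[4], int shift)
    {
        int j, k;
        for (j = 0; j < 4; j++)
        {
            int sum = 0;
            for (k = 0; k < 4; k++)                    /* vmull/vmlal chain */
                sum += in[k] * dct_matrix_sketch[k][j];
            out[j] = (short)((sum + (1 << (shift - 1))) >> shift);  /* vrshrn */
        }
    }
    /* first stage: shift = 14; second stage over those results: shift = 16 */
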
+_dct_matrix_ + DCD dct_matrix +dct_matrix +; DCW 23170, 30274, 23170, 12540 +; DCW 23170, 12540, -23170,-30274 +; DCW 23170, -12540, -23170, 30274 +; DCW 23170, -30274, 23170,-12540 +; 23170 = 0x5a82 +; -23170 = 0xa57e +; 30274 = 0x7642 +; -30274 = 0x89be +; 12540 = 0x30fc +; -12540 = 0xcf04 + DCD 0x76425a82, 0x30fc5a82 + DCD 0x30fc5a82, 0x89bea57e + DCD 0xcf045a82, 0x7642a57e + DCD 0x89be5a82, 0xcf045a82 + + END diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm new file mode 100644 index 000000000..8781ca0cc --- /dev/null +++ b/vp8/encoder/arm/neon/subtract_neon.asm @@ -0,0 +1,171 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_subtract_b_neon_func| + EXPORT |vp8_subtract_mby_neon| + EXPORT |vp8_subtract_mbuv_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;========================================= +;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch); +|vp8_subtract_b_neon_func| PROC + ldr r12, [sp] ;load pitch + + vld1.8 {d0}, [r1], r3 ;load src + vld1.8 {d1}, [r2], r12 ;load pred + vld1.8 {d2}, [r1], r3 + vld1.8 {d3}, [r2], r12 + vld1.8 {d4}, [r1], r3 + vld1.8 {d5}, [r2], r12 + vld1.8 {d6}, [r1], r3 + vld1.8 {d7}, [r2], r12 + + vsubl.u8 q10, d0, d1 + vsubl.u8 q11, d2, d3 + vsubl.u8 q12, d4, d5 + vsubl.u8 q13, d6, d7 + + mov r12, r12, lsl #1 + + vst1.16 {d20}, [r0], r12 ;store diff + vst1.16 {d22}, [r0], r12 + vst1.16 {d24}, [r0], r12 + vst1.16 {d26}, [r0], r12 + + bx lr + ENDP + +;========================================== +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +|vp8_subtract_mby_neon| PROC + mov r12, #4 + +subtract_mby_loop + vld1.8 {q0}, [r1], r3 ;load src + vld1.8 {q1}, [r2]! ;load pred + vld1.8 {q2}, [r1], r3 + vld1.8 {q3}, [r2]! + vld1.8 {q4}, [r1], r3 + vld1.8 {q5}, [r2]! + vld1.8 {q6}, [r1], r3 + vld1.8 {q7}, [r2]! + + vsubl.u8 q8, d0, d2 + vsubl.u8 q9, d1, d3 + vsubl.u8 q10, d4, d6 + vsubl.u8 q11, d5, d7 + vsubl.u8 q12, d8, d10 + vsubl.u8 q13, d9, d11 + vsubl.u8 q14, d12, d14 + vsubl.u8 q15, d13, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + subs r12, r12, #1 + bne subtract_mby_loop + + bx lr + ENDP + +;================================= +;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +|vp8_subtract_mbuv_neon| PROC + ldr r12, [sp] + +;u + add r0, r0, #512 ; short *udiff = diff + 256; + add r3, r3, #256 ; unsigned char *upred = pred + 256; + + vld1.8 {d0}, [r1], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r1], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r1], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r1], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r1], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r1], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r1], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r1], r12 + vld1.8 {d15}, [r3]! 
+ + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + +;v + vld1.8 {d0}, [r2], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r2], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r2], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r2], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r2], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r2], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r2], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r2], r12 + vld1.8 {d15}, [r3]! + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + bx lr + ENDP + + END diff --git a/vp8/encoder/arm/neon/variance_neon.asm b/vp8/encoder/arm/neon/variance_neon.asm new file mode 100644 index 000000000..64b83ca43 --- /dev/null +++ b/vp8/encoder/arm/neon/variance_neon.asm @@ -0,0 +1,275 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_variance16x16_neon| + EXPORT |vp8_variance16x8_neon| + EXPORT |vp8_variance8x16_neon| + EXPORT |vp8_variance8x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance16x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + ;VPADAL adds adjacent pairs of elements of a vector, and accumulates + ;the results into the elements of the destination vector. The explanation + ;in ARM guide is wrong. + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + ;vmov.32 r0, d0[0] ;this instruction costs a lot + ;vmov.32 r1, d1[0] + ;mul r0, r0, r0 + ;str r1, [r12] + ;sub r0, r1, r0, asr #8 + + ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should + ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right. 
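
The comment block above explains the reduction tail of the variance kernels: sum and sse are folded across lanes, sse is stored through the stack argument, and the squared-sum term is subtracted after a sign-extending shift, with the product widened by vmull.s32. In scalar form, parameterized over the four block sizes in this file:

    /* Scalar form of the variance kernels; 16x16 shifts by 8 (256 px),
     * 16x8 and 8x16 by 7, 8x8 by 6. */
    static unsigned int variance_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        int w, int h, int shift,
                                        unsigned int *sse)
    {
        int r, c, sum = 0;
        unsigned int s = 0;

        for (r = 0; r < h; r++)
        {
            for (c = 0; c < w; c++)
            {
                int d = src[c] - ref[c];      /* vsubl.u8   */
                sum += d;                     /* vpadal.s16 */
                s   += (unsigned int)(d * d); /* vmlal.s16  */
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = s;
        /* vmull.s32 widens sum*sum to 64 bits before the shift */
        return s - (unsigned int)(((long long)sum * sum) >> shift);
    }
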
+ vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================ +;unsigned int vp8_variance16x8_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) +|vp8_variance16x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #4 + +variance16x8_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #7 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================= +;unsigned int vp8_variance8x16_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) + +|vp8_variance8x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance8x16_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d2, d6 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + + bne variance8x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #7 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================== +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance8x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +variance8x8_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne 
variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #6 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm new file mode 100644 index 000000000..f26b4d7ae --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm @@ -0,0 +1,67 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_memcpy_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;========================================= +;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); +|vp8_memcpy_neon| PROC + ;pld [r1] ;preload pred data + ;pld [r1, #128] + ;pld [r1, #256] + ;pld [r1, #384] + + mov r12, r2, lsr #8 ;copy 256 bytes data at one time + +memcpy_neon_loop + vld1.8 {q0, q1}, [r1]! ;load src data + subs r12, r12, #1 + vld1.8 {q2, q3}, [r1]! + vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr + vld1.8 {q4, q5}, [r1]! + vst1.8 {q2, q3}, [r0]! + vld1.8 {q6, q7}, [r1]! + vst1.8 {q4, q5}, [r0]! + vld1.8 {q8, q9}, [r1]! + vst1.8 {q6, q7}, [r0]! + vld1.8 {q10, q11}, [r1]! + vst1.8 {q8, q9}, [r0]! + vld1.8 {q12, q13}, [r1]! + vst1.8 {q10, q11}, [r0]! + vld1.8 {q14, q15}, [r1]! + vst1.8 {q12, q13}, [r0]! + vst1.8 {q14, q15}, [r0]! + + ;pld [r1] ;preload pred data -- need to adjust for real device + ;pld [r1, #128] + ;pld [r1, #256] + ;pld [r1, #384] + + bne memcpy_neon_loop + + ands r3, r2, #0xff ;extra copy + beq done_copy_neon_loop + +extra_copy_neon_loop + vld1.8 {q0}, [r1]! ;load src data + subs r3, r3, #16 + vst1.8 {q0}, [r0]! + bne extra_copy_neon_loop + +done_copy_neon_loop + bx lr + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm new file mode 100644 index 000000000..f53596727 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm @@ -0,0 +1,172 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_mse16x16_neon| + EXPORT |vp8_get16x16pred_error_neon| + EXPORT |vp8_get4x4sse_cs_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;============================ +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +;note: in this function, sum is never used. So, we can remove this part of calculation +;from vp8_variance(). 
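
As that note says, the MSE kernel is the variance kernel with the unused sum path removed, so the value stored through *sse and the return value are the same total. A scalar equivalent:

    /* Scalar equivalent of vp8_mse16x16: squared differences only,
     * no sum/mean correction term. */
    static unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse)
    {
        int r, c;
        unsigned int s = 0;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
            {
                int d = src[c] - ref[c];
                s += (unsigned int)(d * d);   /* vsubl.u8 + vmlal.s16 */
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = s;
        return s;
    }
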
+ +|vp8_mse16x16_neon| PROC + vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse + vmov.i8 q8, #0 + vmov.i8 q9, #0 + vmov.i8 q10, #0 + + mov r12, #8 + +mse16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vmlal.s16 q7, d22, d22 + vmlal.s16 q8, d23, d23 + + subs r12, r12, #1 + + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vmlal.s16 q7, d26, d26 + vmlal.s16 q8, d27, d27 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne mse16x16_neon_loop + + vadd.u32 q7, q7, q8 + vadd.u32 q9, q9, q10 + + ldr r12, [sp] ;load *sse from stack + + vadd.u32 q10, q7, q9 + vpaddl.u32 q1, q10 + vadd.u64 d0, d2, d3 + + vst1.32 {d0[0]}, [r12] + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================ +; r0 unsigned char *src_ptr +; r1 int src_stride +; r2 unsigned char *ref_ptr +; r3 int ref_stride +|vp8_get16x16pred_error_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - pred_error + vmov.i8 q10, #0 + + mov r12, #8 + +get16x16pred_error_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 + vmlal.s16 q9, d22, d22 + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne get16x16pred_error_neon_loop + + vadd.u32 q10, q9, q10 + vpaddl.s32 q0, q8 + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] + bx lr + + ENDP + +;============================= +; r0 unsigned char *src_ptr, +; r1 int source_stride, +; r2 unsigned char *ref_ptr, +; r3 int recon_stride +|vp8_get4x4sse_cs_neon| PROC + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vmull.s16 q7, d22, d22 + vmull.s16 q8, d24, d24 + vmull.s16 q9, d26, d26 + vmull.s16 q10, d28, d28 + + vadd.u32 q7, q7, q8 + vadd.u32 q9, q9, q10 + vadd.u32 q9, q7, q9 + + vpaddl.u32 q1, q9 + vadd.u64 d0, d2, d3 + + vmov.32 r0, d0[0] + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm new file mode 100644 index 000000000..9c52c52f6 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8cx_pack_tokens_armv7| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 vp8_writer *w +; r1 const TOKENEXTRA *p +; r2 int xcount +; r3 vp8_coef_encodings +; s0 vp8_extra_bits +; s1 vp8_coef_tree +|vp8cx_pack_tokens_armv7| PROC + push {r4-r11, lr} + + ; Add size of xcount * sizeof (TOKENEXTRA) to get stop + ; sizeof (TOKENEXTRA) is 20 + add r2, r2, r2, lsl #2 ; xcount + sub sp, sp, #12 + add r2, r1, r2, lsl #2 ; stop = p + xcount + str r2, [sp, #0] + str r3, [sp, #8] ; save vp8_coef_encodings + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + b check_p_lt_stop + +while_p_lt_stop + ldr r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #8] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldr r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r12, r6 + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #52] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsr r12, r12, r4 ; v >>= 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsrs r12, r12, #1 ; bb = v >> n + mul r4, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #52] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldr r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #48] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. Here vp8_extra_bit_struct == 20 + add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t + add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldr lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rbit r12, r7 ; reverse v + rsb r4, r8, #32 + lsr r12, r12, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsrs r12, r12, #1 ; v >> n + mul r4, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr 
r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + add sp, sp, #12 + pop {r4-r11, pc} + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm new file mode 100644 index 000000000..92b098909 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm @@ -0,0 +1,335 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8cx_pack_mb_row_tokens_armv7| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 vp8_writer *w +; r2 vp8_coef_encodings +; r3 vp8_extra_bits +; s0 vp8_coef_tree + +|vp8cx_pack_mb_row_tokens_armv7| PROC + push {r4-r11, lr} + sub sp, sp, #24 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r2, [sp, #20] ; save vp8_coef_encodings + str r5, [sp, #12] ; save mb_rows + str r3, [sp, #8] ; save vp8_extra_bits + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + + mov r0, r1 ; keep same as other loops + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actuall work gets done here! 
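
The loop below repeats the token pack of vp8cx_pack_tokens_armv7 once per macroblock row. Every bit these loops emit goes through the same bool-coder step; a hedged C model of that step follows (struct and function names are hypothetical). It mirrors the asm: count starts at -24 (the mvn r2, #23 in vp8_start_encode), counting leading zeros of the new range supplies the renormalization shift, and the 0xff walk propagates a carry back through already-written bytes.

    /* Hedged C model of one bool-coder step, mirroring vp8_encode_bool
     * and the token_loop bodies in these files. */
    typedef struct
    {
        unsigned int   lowvalue;
        unsigned int   range;
        int            count;     /* starts at -24 */
        unsigned int   pos;
        unsigned char *buffer;
    } bool_coder_sketch;

    static void encode_bool_sketch(bool_coder_sketch *w, int bit, int prob)
    {
        unsigned int split = 1 + (((w->range - 1) * (unsigned int)prob) >> 8);
        int shift, offset;

        if (bit)
        {
            w->lowvalue += split;                /* addcs / addne */
            w->range    -= split;                /* subcs / subne */
        }
        else
            w->range = split;

        /* clz r6, r4 ; sub r6, r6, #24 -- normalize range back to >= 128 */
        for (shift = 0; (w->range << shift) < 128; shift++)
            ;

        w->count += shift;                       /* adds: sign flag = count < 0 */
        w->range <<= shift;

        if (w->count >= 0)
        {
            offset = shift - w->count;

            if ((w->lowvalue << (offset - 1)) & 0x80000000)
            {
                /* carry out: walk back over finished 0xff bytes */
                int x = (int)w->pos - 1;
                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;
                w->buffer[x] += 1;
            }

            w->buffer[w->pos++] = (unsigned char)(w->lowvalue >> (24 - offset));
            w->lowvalue <<= offset;
            w->lowvalue &= 0xffffff;             /* bic #0xff000000 */
            shift = w->count;                    /* mov r6, r3      */
            w->count -= 8;
        }

        w->lowvalue <<= shift;
    }
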
+ +while_p_lt_stop + ldr r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #20] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldr r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r12, r6 + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #60] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsr r12, r12, r4 ; v >>= 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsrs r12, r12, #1 ; bb = v >> n + mul r4, r4, r7 ; ((range-1) * pp[i>>1]) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but here it is needed as a + ; temp variable. So after r10 is used, reload + ; vp8_coef_tree into r10 + ldr r10, [sp, #60] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldr r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #8] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element.
Here vp8_extra_bit_struct == 20 + add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t + add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldr lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rbit r12, r7 ; reverse v + rsb r4, r8, #32 + lsr r12, r12, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsrs r12, r12, #1 ; v >> n + mul r4, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc 
while_p_lt_stop + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, #1 + add r7, r7, #TOKENLIST_SZ ; next element in the array + str r6, [sp, #12] + bne mb_row_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + add sp, sp, #24 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist + + END diff --git a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm new file mode 100644 index 000000000..6d5f882ed --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm @@ -0,0 +1,471 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8cx_pack_tokens_into_partitions_armv7| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 unsigned char *cx_data +; r2 int num_part +; r3 *size +; s0 vp8_coef_encodings +; s1 vp8_extra_bits, +; s2 const vp8_tree_index *, + +|vp8cx_pack_tokens_into_partitions_armv7| PROC + push {r4-r11, lr} + sub sp, sp, #44 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r5, [sp, #36] ; save mb_rows + str r1, [sp, #24] ; save cx_data + str r2, [sp, #20] ; save num_part + str r3, [sp, #8] ; save *size + + ; *size = 3*(num_part -1 ); + sub r2, r2, #1 ; num_part - 1 + add r2, r2, r2, lsl #1 ; 3*(num_part - 1) + str r2, [r3] + + add r2, r2, r1 ; cx_data + *size + str r2, [sp, #40] ; ptr + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + str r7, [sp, #32] ; store start of cpi->tp_list + + ldr r11, _VP8_COMP_bc2_ ; load up vp8_writer out of cpi + add r0, r0, r11 + + mov r11, #0 + str r11, [sp, #28] ; i + +numparts_loop + ldr r10, [sp, #40] ; ptr + ldr r5, [sp, #36] ; move mb_rows to the counting section + str r5, [sp, #12] + + ; Reset all of the VP8 Writer data for each partition that + ; is processed. + ; start_encode + mov r2, #0 ; vp8_writer_lowvalue + mov r5, #255 ; vp8_writer_range + mvn r3, #23 ; vp8_writer_count + + str r2, [r0, #vp8_writer_value] + str r2, [r0, #vp8_writer_pos] + str r10, [r0, #vp8_writer_buffer] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actual work gets done here! + +while_p_lt_stop + ldr r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #80] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldr r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. 
ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r12, r6 + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #88] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsr r12, r12, r4 ; v >>= 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is referred to as "bb" + lsrs r12, r12, #1 ; bb = v >> n + mul r4, r4, r7 ; ((range-1) * pp[i>>1]) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but here it is needed as a + ; temp variable. So after r10 is used, reload + ; vp8_coef_tree into r10 + ldr r10, [sp, #88] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldr r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #84] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element.
Here vp8_extra_bit_struct == 20 + add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t + add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldr lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rbit r12, r7 ; reverse v + rsb r4, r8, #32 + lsr r12, r12, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsrs r12, r12, #1 ; v >> n + mul r4, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc 
while_p_lt_stop + + ldr r10, [sp, #20] ; num_parts + mov r1, #TOKENLIST_SZ + mul r1, r10, r1 + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, r10 + add r7, r7, r1 ; next element in the array + str r6, [sp, #12] + bgt mb_row_loop + + mov r12, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r12, r12, #1 + bne stop_encode_loop + + ldr r10, [sp, #8] ; *size + ldr r11, [r10] + ldr r4, [r0, #vp8_writer_pos] ; w->pos + add r11, r11, r4 ; *size += w->pos + str r11, [r10] + + ldr r9, [sp, #20] ; num_parts + sub r9, r9, #1 + ldr r10, [sp, #28] ; i + cmp r10, r9 ; if(i<(num_part - 1)) + bge skip_write_partition + + ldr r12, [sp, #40] ; ptr + add r12, r12, r4 ; ptr += w->pos + str r12, [sp, #40] + + ldr r9, [sp, #24] ; cx_data + mov r8, r4, asr #8 + strb r4, [r9, #0] + strb r8, [r9, #1] + mov r4, r4, asr #16 + strb r4, [r9, #2] + + add r9, r9, #3 ; cx_data += 3 + str r9, [sp, #24] + +skip_write_partition + + ldr r11, [sp, #28] ; i + ldr r10, [sp, #20] ; num_parts + + add r11, r11, #1 ; i++ + str r11, [sp, #28] + + ldr r7, [sp, #32] ; cpi->tp_list[i] + mov r1, #TOKENLIST_SZ + add r7, r7, r1 ; next element in cpi->tp_list + str r7, [sp, #32] ; cpi->tp_list[i+1] + + cmp r10, r11 + bgt numparts_loop + + + add sp, sp, #44 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist +_VP8_COMP_bc2_ + DCD vp8_comp_bc2 + + END diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm new file mode 100644 index 000000000..5269c0af8 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm @@ -0,0 +1,75 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_short_walsh4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_short_walsh4x4_c(short *input, short *output, int pitch) + +|vp8_short_walsh4x4_neon| PROC + vld1.16 {d2}, [r0], r2 ;load input + vld1.16 {d3}, [r0], r2 + vld1.16 {d4}, [r0], r2 + vld1.16 {d5}, [r0], r2 + + ;First for-loop + ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[3] + vadd.s16 d7, d3, d4 ;b1 = ip[1]+ip[2] + vsub.s16 d8, d3, d4 ;c1 = ip[1]-ip[2] + vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[3] + + vadd.s16 d2, d6, d7 ;op[0] = a1 + b1 + vsub.s16 d4, d6, d7 ;op[2] = a1 - b1 + vadd.s16 d3, d8, d9 ;op[1] = c1 + d1 + vsub.s16 d5, d9, d8 ;op[3] = d1 - c1 + + ;Second for-loop + ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12] + vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8] + vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8] + vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12] + + vadd.s16 d2, d6, d7 ;a2 = a1 + b1; + vsub.s16 d4, d6, d7 ;c2 = a1 - b1; + vadd.s16 d3, d8, d9 ;b2 = c1 + d1; + vsub.s16 d5, d9, d8 ;d2 = d1 - c1; + + vcgt.s16 q3, q1, #0 + vcgt.s16 q4, q2, #0 + + vsub.s16 q1, q1, q3 + vsub.s16 q2, q2, q4 + + vshr.s16 q1, q1, #1 + vshr.s16 q2, q2, #1 + + vst1.16 {q1, q2}, [r1] + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm new file mode 100644 index 000000000..aec716e3b --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -0,0 +1,427 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sub_pixel_variance16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon. 
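For orientation before the NEON code, the following scalar C sketch computes the same result for the common case where both offsets are non-zero; the assembly below additionally special-cases xoffset==0 and yoffset==0 so that only one filter pass runs. Names are illustrative, not the libvpx C API:

    #include <stdint.h>

    static const int bilinear_taps_sketch[8][2] = {
        {128, 0}, {112, 16}, {96, 32}, {80, 48},
        {64, 64}, {48, 80}, {32, 96}, {16, 112}
    };

    static unsigned int subpel_var16x16_sketch(const uint8_t *src, int src_stride,
                                               int xoffset, int yoffset,
                                               const uint8_t *dst, int dst_stride,
                                               unsigned int *sse)
    {
        uint8_t fdata[17 * 16];   /* 17 rows: one extra row for the vertical pass */
        uint8_t tdata[16 * 16];
        int r, c, sum = 0;
        unsigned int sq = 0;

        /* First pass: horizontal bilinear filter, 17 rows x 16 columns. */
        for (r = 0; r < 17; r++)
            for (c = 0; c < 16; c++)
                fdata[r * 16 + c] = (uint8_t)
                    ((src[r * src_stride + c]     * bilinear_taps_sketch[xoffset][0] +
                      src[r * src_stride + c + 1] * bilinear_taps_sketch[xoffset][1] +
                      64) >> 7);                       /* vqrshrn.u16 #7 */

        /* Second pass: vertical bilinear filter, 16 rows x 16 columns. */
        for (r = 0; r < 16; r++)
            for (c = 0; c < 16; c++)
                tdata[r * 16 + c] = (uint8_t)
                    ((fdata[r * 16 + c]       * bilinear_taps_sketch[yoffset][0] +
                      fdata[(r + 1) * 16 + c] * bilinear_taps_sketch[yoffset][1] +
                      64) >> 7);

        /* Variance of the filtered block against the reference block. */
        for (r = 0; r < 16; r++)
            for (c = 0; c < 16; c++) {
                int d = tdata[r * 16 + c] - dst[r * dst_stride + c];
                sum += d;
                sq  += (unsigned int)(d * d);
            }

        *sse = sq;
        return sq - (unsigned int)(((int64_t)sum * sum) >> 8);  /* sum^2 / 256 */
    }

The assembly stages the first-pass output in a stack buffer, exactly like fdata above, and fuses the final loop into the NEON vsubl/vpadal/vmlal sequence.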
+ +|vp8_sub_pixel_variance16x16_neon| PROC + push {r4-r6, lr} + + ldr r12, _BilinearTaps_coeff_ + ldr r4, [sp, #16] ;load *dst_ptr from stack + ldr r5, [sp, #20] ;load dst_pixels_per_line from stack + ldr r6, [sp, #24] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! + vld1.u8 {d11, d12, d13}, [r0], r1 + + bne vp8e_filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! 
;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! + +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + sub sp, sp, #256 + mov r3, sp + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +vp8e_filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + vst1.u8 {d4, d5}, [r3]! + vst1.u8 {d6, d7}, [r3]! + vmov q11, q15 + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_sp16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + sub sp, sp, #528 ;reserve space on stack for temporary storage + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r3]! ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r3]! + vst1.u8 {d18, d19}, [r3]! + vst1.u8 {d20, d21}, [r3]! 
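+ ; note: each pass of this first-pass-only loop filters four source rows, so four passes leave all 16 filtered rows in the stack buffer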
+ + bne vp8e_filt_blk2d_fpo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + sub sp, sp, #528 ;reserve space on stack for temporary storage + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + mov r3, sp + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +vp8e_filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r3]! + vmov q11, q15 + vst1.u8 {d6, d7}, [r3]! + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_spo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r12, #8 + +sub_pixel_variance16x16_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q2}, [r4], r5 + vld1.8 {q1}, [r3]! + vld1.8 {q3}, [r4], r5 + + vsubl.u8 q11, d0, d4 ;diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne sub_pixel_variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r6] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + add sp, sp, #528 + vmov.32 r0, d0[0] ;return + + pop {r4-r6,pc} + + ENDP + +;----------------- + AREA vp8e_bilinear_taps_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_BilinearTaps_coeff_ + DCD bilinear_taps_coeff +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm new file mode 100644 index 000000000..3d02d7c40 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm @@ -0,0 +1,571 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sub_pixel_variance16x16s_4_0_neon| + EXPORT |vp8_sub_pixel_variance16x16s_0_4_neon| + EXPORT |vp8_sub_pixel_variance16x16s_4_4_neon| + EXPORT |vp8_sub_pixel_variance16x16s_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;================================================ +;unsigned int vp8_sub_pixel_variance16x16s_4_0_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_sub_pixel_variance16x16s_4_0_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +;First Pass: output_height lines x output_width columns (16x16) +vp8_filt_fpo16x16s_4_0_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.8 {q11}, [r2], r3 + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.8 {q12}, [r2], r3 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.8 {q13}, [r2], r3 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vext.8 q3, q2, q3, #1 + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vld1.8 {q14}, [r2], r3 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + + vsubl.u8 q4, d0, d22 ;diff + vsubl.u8 q5, d1, d23 + vsubl.u8 q6, d2, d24 + vsubl.u8 q7, d3, d25 + vsubl.u8 q0, d4, d26 + vsubl.u8 q1, d5, d27 + vsubl.u8 q2, d6, d28 + vsubl.u8 q3, d7, d29 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + subs r12, r12, #1 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_fpo16x16s_4_0_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp8_sub_pixel_variance16x16s_0_4_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_sub_pixel_variance16x16s_0_4_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + + vld1.u8 {q0}, [r0], r1 ;load src data + ldr lr, [sp, #4] ;load *sse from stack + + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +vp8_filt_spo16x16s_0_4_loop_neon + vld1.u8 {q2}, [r0], r1 + vld1.8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vld1.8 {q3}, [r2], r3 + 
vld1.u8 {q6}, [r0], r1 + vld1.8 {q5}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + + vrhadd.u8 q0, q0, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q4 + vrhadd.u8 q4, q4, q6 + vrhadd.u8 q6, q6, q15 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + + vmov q0, q15 + + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_spo16x16s_0_4_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp8_sub_pixel_variance16x16s_4_4_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_sub_pixel_variance16x16s_4_4_neon| PROC + push {lr} + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q13, #0 ;q8 - sum + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + + vmov.i8 q14, #0 ;q9, q10 - sse + vmov.i8 q15, #0 + + mov r12, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8_filt16x16s_4_4_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vld1.8 {q5}, [r2], r3 + vrhadd.u8 q0, q0, q1 + vld1.8 {q6}, [r2], r3 + vrhadd.u8 q1, q1, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q3 + vld1.8 {q8}, [r2], r3 + vrhadd.u8 q3, q3, q4 + + vsubl.u8 q9, d0, d10 ;diff + vsubl.u8 q10, d1, d11 + vsubl.u8 q11, d2, d12 + vsubl.u8 q12, d3, d13 + + vsubl.u8 q0, d4, d14 ;diff + vsubl.u8 q1, d5, d15 + vsubl.u8 q5, d6, d16 + vsubl.u8 q6, d7, d17 + + vpadal.s16 q13, q9 ;sum + vmlal.s16 q14, d18, d18 ;sse + vmlal.s16 q15, d19, d19 + + vpadal.s16 q13, q10 ;sum + vmlal.s16 q14, d20, d20 ;sse + vmlal.s16 q15, d21, d21 + + vpadal.s16 q13, q11 ;sum + vmlal.s16 q14, d22, d22 ;sse + vmlal.s16 q15, d23, d23 + + vpadal.s16 q13, q12 ;sum + vmlal.s16 q14, d24, d24 ;sse + vmlal.s16 q15, d25, d25 + + subs r12, r12, #1 + + vpadal.s16 q13, q0 ;sum + vmlal.s16 q14, d0, d0 ;sse + vmlal.s16 q15, d1, d1 + + vpadal.s16 q13, q1 ;sum + vmlal.s16 q14, d2, d2 ;sse + vmlal.s16 q15, d3, d3 + + vpadal.s16 q13, q5 ;sum 
+ vmlal.s16 q14, d10, d10 ;sse + vmlal.s16 q15, d11, d11 + + vmov q0, q4 + + vpadal.s16 q13, q6 ;sum + vmlal.s16 q14, d12, d12 ;sse + vmlal.s16 q15, d13, d13 + + bne vp8_filt16x16s_4_4_loop_neon + + vadd.u32 q15, q14, q15 ;accumulate sse + vpaddl.s32 q0, q13 ;accumulate sum + + vpaddl.u32 q1, q15 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;============================== +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pixels_per_line, +; stack unsigned int *sse +;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step() +;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter, +;or filter coeff is {64, 64}. This simplified program only works in this situation. +;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later. + +|vp8_sub_pixel_variance16x16s_neon| PROC + push {r4, lr} + + ldr r4, [sp, #8] ;load *dst_ptr from stack + ldr r12, [sp, #12] ;load dst_pixels_per_line from stack + ldr lr, [sp, #16] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16s_only + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq firstpass_bfilter16x16s_only + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + sub sp, sp, #256 ;reserve space on stack for temporary storage + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + mov r3, sp + mov r2, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16s_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vrhadd.u8 q0, q0, q1 + vrhadd.u8 q1, q1, q2 + vrhadd.u8 q2, q2, q3 + vrhadd.u8 q3, q3, q4 + + subs r2, r2, #1 + vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result + vmov q0, q4 + vst1.u8 {d4, d5, d6, d7}, [r3]! 
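+ ; note: four vertically averaged rows are stored per pass; the vmov q0, q4 above carries the newest horizontally averaged row into the next pass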
+ + bne vp8e_filt_blk2d_fp16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;-------------------- +firstpass_bfilter16x16s_only + mov r2, #2 ;loop counter + sub sp, sp, #256 ;reserve space on stack for temporary storage + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16s_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + vext.8 q3, q2, q3, #1 + vld1.u8 {d20, d21, d22, d23}, [r0], r1 + vext.8 q5, q4, q5, #1 + vld1.u8 {d24, d25, d26, d27}, [r0], r1 + vext.8 q7, q6, q7, #1 + vld1.u8 {d28, d29, d30, d31}, [r0], r1 + vext.8 q9, q8, q9, #1 + vext.8 q11, q10, q11, #1 + vext.8 q13, q12, q13, #1 + vext.8 q15, q14, q15, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + vrhadd.u8 q5, q10, q11 + vrhadd.u8 q6, q12, q13 + vrhadd.u8 q7, q14, q15 + + subs r2, r2, #1 + + vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result + vst1.u8 {d4, d5, d6, d7}, [r3]! + vst1.u8 {d8, d9, d10, d11}, [r3]! + vst1.u8 {d12, d13, d14, d15}, [r3]! + + bne vp8e_filt_blk2d_fpo16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;--------------------- +secondpass_bfilter16x16s_only + sub sp, sp, #256 ;reserve space on stack for temporary storage + + mov r2, #2 ;loop counter + vld1.u8 {d0, d1}, [r0], r1 ;load src data + mov r3, sp + +vp8e_filt_blk2d_spo16x16s_loop_neon + vld1.u8 {d2, d3}, [r0], r1 + vld1.u8 {d4, d5}, [r0], r1 + vld1.u8 {d6, d7}, [r0], r1 + vld1.u8 {d8, d9}, [r0], r1 + + vrhadd.u8 q0, q0, q1 + vld1.u8 {d10, d11}, [r0], r1 + vrhadd.u8 q1, q1, q2 + vld1.u8 {d12, d13}, [r0], r1 + vrhadd.u8 q2, q2, q3 + vld1.u8 {d14, d15}, [r0], r1 + vrhadd.u8 q3, q3, q4 + vld1.u8 {d16, d17}, [r0], r1 + vrhadd.u8 q4, q4, q5 + vrhadd.u8 q5, q5, q6 + vrhadd.u8 q6, q6, q7 + vrhadd.u8 q7, q7, q8 + + subs r2, r2, #1 + + vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result + vmov q0, q8 + vst1.u8 {d4, d5, d6, d7}, [r3]! + vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result + vst1.u8 {d12, d13, d14, d15}, [r3]! + + bne vp8e_filt_blk2d_spo16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16s_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r2, #4 + +sub_pixel_variance16x16s_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q1}, [r4], r12 + vld1.8 {q2}, [r3]! + vld1.8 {q3}, [r4], r12 + vld1.8 {q4}, [r3]! + vld1.8 {q5}, [r4], r12 + vld1.8 {q6}, [r3]! 
+ vld1.8 {q7}, [r4], r12 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r2, r2, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne sub_pixel_variance16x16s_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + add sp, sp, #256 + vmov.32 r0, d0[0] ;return + + pop {r4, pc} + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm new file mode 100644 index 000000000..bd56761fa --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm @@ -0,0 +1,226 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sub_pixel_variance8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon. 
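The 8x8 variant below follows the same two-pass bilinear structure as the 16x16 function above, with a 9-row first pass and a different final divisor: with 64 pixels the squared sum is shifted right by 6 rather than 8. A minimal sketch of the closing reduction (names are illustrative):

    #include <stdint.h>

    /* variance = SSE - sum^2 / 64 for an 8x8 block */
    static unsigned int variance8x8_reduce_sketch(int sum, unsigned int sq,
                                                  unsigned int *sse)
    {
        *sse = sq;                                              /* vst1.32 {d1[0]}, [lr] */
        return sq - (unsigned int)(((int64_t)sum * sum) >> 6);  /* vshr.s32 d10, d10, #6 */
    }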
+ +|vp8_sub_pixel_variance8x8_neon| PROC + push {r4-r5, lr} + + ldr r12, _BilinearTaps_coeff_ + ldr r4, [sp, #12] ;load *dst_ptr from stack + ldr r5, [sp, #16] ;load dst_pixels_per_line from stack + ldr lr, [sp, #20] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (9x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + + vld1.u8 {q1}, [r0], r1 ;load src data + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vld1.u8 {q2}, [r0], r1 + vqrshrn.u16 d23, q7, #7 + vld1.u8 {q3}, [r0], r1 + vqrshrn.u16 d24, q8, #7 + vld1.u8 {q4}, [r0], r1 + vqrshrn.u16 d25, q9, #7 + + ;first_pass filtering on the rest 5-line data + vld1.u8 {q5}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d27, q7, #7 + vqrshrn.u16 d28, q8, #7 + vqrshrn.u16 d29, q9, #7 + vqrshrn.u16 d30, q10, #7 + +;Second pass: 8x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + ;skip_secondpass_filter + beq sub_pixel_variance8x8_neon + + add r3, r12, r3, lsl #3 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d23, q2, #7 + vqrshrn.u16 d24, q3, #7 + vqrshrn.u16 d25, q4, #7 + vqrshrn.u16 d26, q5, #7 + vqrshrn.u16 d27, q6, #7 + vqrshrn.u16 d28, q7, #7 + vqrshrn.u16 d29, q8, #7 + + b sub_pixel_variance8x8_neon + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;---------------------- +;vp8_variance8x8_neon +sub_pixel_variance8x8_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +sub_pixel_variance8x8_neon_loop + vld1.8 {d0}, [r4], r5 ;load dst data + subs r12, r12, #1 + vld1.8 {d1}, [r4], 
r5 + vld1.8 {d2}, [r4], r5 + vsubl.u8 q4, d22, d0 ;calculate diff + vld1.8 {d3}, [r4], r5 + + vsubl.u8 q5, d23, d1 + vsubl.u8 q6, d24, d2 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + vsubl.u8 q7, d25, d3 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + + vmov q11, q13 + + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + + vmov q12, q14 + + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + bne sub_pixel_variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #6 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {r4-r5, pc} + + ENDP + +;----------------- + AREA bilinear_taps_dat, DATA, READWRITE ;read/write by default +;Data section with name bilinear_taps_dat is specified. DCD reserves one word of +;memory for each of the 16 filter coefficients. The label bilinear_taps_coeff can +;be used to access the data. +;Data address: bilinear_taps_coeff, bilinear_taps_coeff+4, bilinear_taps_coeff+8 ... +_BilinearTaps_coeff_ + DCD bilinear_taps_coeff +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END
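One closing note on the half-pel variants earlier in this change: the vp8_sub_pixel_variance16x16s_* functions replace the generic two-tap filter with vrhadd.u8, which is exactly the {64, 64} tap pair from the table above. A one-line C model of that instruction's per-byte behavior (a sketch, not an API):

    #include <stdint.h>

    /* vrhadd.u8: rounding halving add, i.e. the bilinear {64, 64} case. */
    static inline uint8_t half_pel_avg_sketch(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }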