From f5e433464b6a0a79978d966bd666c794415abdce Mon Sep 17 00:00:00 2001
From: Tero Rintaluoma
Date: Mon, 28 Mar 2011 09:51:51 +0300
Subject: Half pixel variance further optimized for ARMv6

Half pixel interpolations optimized in variance calculations.
Separate function calls to vp8_filter_block2d_bil_x_pass_armv6
are avoided. On average, the performance improvement is 6-7% for
VGA@30fps sequences.

Change-Id: Idb5f118a9d51548e824719d2cfe5be0fa6996628
---
 vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm  |  23 +--
 .../armv6/vp8_variance_halfpixvar16x16_h_armv6.asm | 176 +++++++++++++++++
 .../vp8_variance_halfpixvar16x16_hv_armv6.asm      | 216 +++++++++++++++++++++
 .../armv6/vp8_variance_halfpixvar16x16_v_armv6.asm | 178 +++++++++++++++++
 vp8/encoder/arm/variance_arm.c                     |  77 +++-----
 5 files changed, 614 insertions(+), 56 deletions(-)
 create mode 100644 vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
 create mode 100644 vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
 create mode 100644 vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
(limited to 'vp8/encoder')

diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
index 8d7258af7..988376390 100644
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,14 +25,14 @@ |vp8_variance16x16_armv6| PROC
     stmfd sp!, {r4-r12, lr}
-    mov r12, #16 ; set loop counter to 16 (=block height)
     mov r8, #0 ; initialize sum = 0
     mov r11, #0 ; initialize sse = 0
+    mov r12, #16 ; set loop counter to 16 (=block height)
 loop
     ; 1st 4 pixels
-    ldr r4, [r0, #0x0] ; load 4 src pixels
-    ldr r5, [r2, #0x0] ; load 4 ref pixels
+    ldr r4, [r0, #0] ; load 4 src pixels
+    ldr r5, [r2, #0] ; load 4 ref pixels
 
     mov lr, #0 ; constant zero
 
@@ -55,8 +55,8 @@ loop
     smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
 
     ; 2nd 4 pixels
-    ldr r4, [r0, #0x4] ; load 4 src pixels
-    ldr r5, [r2, #0x4] ; load 4 ref pixels
+    ldr r4, [r0, #4] ; load 4 src pixels
+    ldr r5, [r2, #4] ; load 4 ref pixels
     smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
 
     usub8 r6, r4, r5 ; calculate difference
@@ -79,8 +79,8 @@ loop
     smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
 
     ; 3rd 4 pixels
-    ldr r4, [r0, #0x8] ; load 4 src pixels
-    ldr r5, [r2, #0x8] ; load 4 ref pixels
+    ldr r4, [r0, #8] ; load 4 src pixels
+    ldr r5, [r2, #8] ; load 4 ref pixels
     smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
 
     usub8 r6, r4, r5 ; calculate difference
@@ -103,8 +103,8 @@ loop
     smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
 
     ; 4th 4 pixels
-    ldr r4, [r0, #0xc] ; load 4 src pixels
-    ldr r5, [r2, #0xc] ; load 4 ref pixels
+    ldr r4, [r0, #12] ; load 4 src pixels
+    ldr r5, [r2, #12] ; load 4 ref pixels
     smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
 
     usub8 r6, r4, r5 ; calculate difference
@@ -135,13 +135,14 @@ loop
     bne loop
 
     ; return stuff
-    ldr r6, [sp, #0x28] ; get address of sse
+    ldr r6, [sp, #40] ; get address of sse
     mul r0, r8, r8 ; sum * sum
     str r11, [r6] ; store sse
-    sub r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+    sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd sp!, {r4-r12, pc}
 
     ENDP
 
     END
+
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
new file mode 100644
index 000000000..2350f3e8b
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,176 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd sp!, {r4-r12, lr}
+    mov r8, #0 ; initialize sum = 0
+    ldr r10, c80808080
+    mov r11, #0 ; initialize sse = 0
+    mov r12, #16 ; set loop counter to 16 (=block height)
+    mov lr, #0 ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr r4, [r0, #0] ; load 4 src pixels
+    ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+    ldr r5, [r2, #0] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+    ; calculate total sum
+    adds r8, r8, r4 ; add positive differences to sum
+    subs r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr r4, [r0, #4] ; load 4 src pixels
+    ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+    ldr r5, [r2, #4] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr r4, [r0, #8] ; load 4 src pixels
+    ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+    ldr r5, [r2, #8] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr r4, [r0, #12] ; load 4 src pixels
+    ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+    ldr r5, [r2, #12] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    usub8 r6, r4, r5 ; calculate difference
+    add r0, r0, r1 ; set src_ptr to next row
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add r2, r2, r3 ; set dst_ptr to next row
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    subs r12, r12, #1
+
+    bne loop
+
+    ; return stuff
+    ldr r6, [sp, #40] ; get address of sse
+    mul r0, r8, r8 ; sum * sum
+    str r11, [r6] ; store sse
+    sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD 0x80808080
+
+    END
+
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
new file mode 100644
index 000000000..f9ae3b7e2
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,216 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd sp!, {r4-r12, lr}
+    mov r8, #0 ; initialize sum = 0
+    ldr r10, c80808080
+    mov r11, #0 ; initialize sse = 0
+    mov r12, #16 ; set loop counter to 16 (=block height)
+    mov lr, #0 ; constant zero
+loop
+    add r9, r0, r1 ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr r4, [r0, #0] ; load source pixels a, row N
+    ldr r6, [r0, #1] ; load source pixels b, row N
+    ldr r5, [r9, #0] ; load source pixels c, row N+1
+    ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn r7, r7
+    uhsub8 r5, r5, r7
+    eor r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn r5, r5
+    uhsub8 r4, r4, r5
+    ldr r5, [r2, #0] ; load 4 ref pixels
+    eor r4, r4, r10
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+    ; calculate total sum
+    adds r8, r8, r4 ; add positive differences to sum
+    subs r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr r4, [r0, #4] ; load source pixels a, row N
+    ldr r6, [r0, #5] ; load source pixels b, row N
+    ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn r7, r7
+    uhsub8 r5, r5, r7
+    eor r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn r5, r5
+    uhsub8 r4, r4, r5
+    ldr r5, [r2, #4] ; load 4 ref pixels
+    eor r4, r4, r10
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr r4, [r0, #8] ; load source pixels a, row N
+    ldr r6, [r0, #9] ; load source pixels b, row N
+    ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn r7, r7
+    uhsub8 r5, r5, r7
+    eor r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn r5, r5
+    uhsub8 r4, r4, r5
+    ldr r5, [r2, #8] ; load 4 ref pixels
+    eor r4, r4, r10
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr r4, [r0, #12] ; load source pixels a, row N
+    ldr r6, [r0, #13] ; load source pixels b, row N
+    ldr r5, [r9, #12] ; load source pixels c, row N+1
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+    ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn r7, r7
+    uhsub8 r5, r5, r7
+    eor r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn r5, r5
+    uhsub8 r4, r4, r5
+    ldr r5, [r2, #12] ; load 4 ref pixels
+    eor r4, r4, r10
+
+    usub8 r6, r4, r5 ; calculate difference
+    add r0, r0, r1 ; set src_ptr to next row
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add r2, r2, r3 ; set dst_ptr to next row
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+    subs r12, r12, #1
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    bne loop
+
+    ; return stuff
+    ldr r6, [sp, #40] ; get address of sse
+    mul r0, r8, r8 ; sum * sum
+    str r11, [r6] ; store sse
+    sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD 0x80808080
+
+    END
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
new file mode 100644
index 000000000..9e0a03548
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,178 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd sp!, {r4-r12, lr}
+    mov r8, #0 ; initialize sum = 0
+    ldr r10, c80808080
+    mov r11, #0 ; initialize sse = 0
+    mov r12, #16 ; set loop counter to 16 (=block height)
+    mov lr, #0 ; constant zero
+loop
+    add r9, r0, r1 ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr r4, [r0, #0] ; load 4 src pixels
+    ldr r6, [r9, #0] ; load 4 src pixels from next row
+    ldr r5, [r2, #0] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+    ; calculate total sum
+    adds r8, r8, r4 ; add positive differences to sum
+    subs r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr r4, [r0, #4] ; load 4 src pixels
+    ldr r6, [r9, #4] ; load 4 src pixels from next row
+    ldr r5, [r2, #4] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr r4, [r0, #8] ; load 4 src pixels
+    ldr r6, [r9, #8] ; load 4 src pixels from next row
+    ldr r5, [r2, #8] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    usub8 r6, r4, r5 ; calculate difference
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr r4, [r0, #12] ; load 4 src pixels
+    ldr r6, [r9, #12] ; load 4 src pixels from next row
+    ldr r5, [r2, #12] ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn r6, r6
+    uhsub8 r4, r4, r6
+    eor r4, r4, r10
+
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+    usub8 r6, r4, r5 ; calculate difference
+    add r0, r0, r1 ; set src_ptr to next row
+    sel r7, r6, lr ; select bytes with positive difference
+    usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add r2, r2, r3 ; set dst_ptr to next row
+    sel r6, r6, lr ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8 r4, r7, lr ; calculate sum of positive differences
+    usad8 r5, r6, lr ; calculate sum of negative differences
+    orr r6, r6, r7 ; differences of all 4 pixels
+
+    ; calculate total sum
+    add r8, r8, r4 ; add positive differences to sum
+    sub r8, r8, r5 ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16 r5, r6 ; byte (two pixels) to halfwords
+    uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+    smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+    smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+    subs r12, r12, #1
+
+    bne loop
+
+    ; return stuff
+    ldr r6, [sp, #40] ; get address of sse
+    mul r0, r8, r8 ; sum * sum
+    str r11, [r6] ; store sse
+    sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD 0x80808080
+
+    END
+
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index ed1fb16d5..e77be9f73 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -57,51 +57,38 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
     unsigned short first_pass[36*16];
     unsigned char second_pass[20*16];
     const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            17, 16, HFilter);
-    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             16, 16, 16, VFilter);
-
-    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                   dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 0,
-                                             ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 0, 4,
-                                             ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 4,
-                                             ref_ptr, recon_stride, sse);
+    unsigned int var;
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                    dst_ptr, dst_pixels_per_line, sse);
+    }
+    else
+    {
+        HFilter = vp8_bilinear_filters[xoffset];
+        VFilter = vp8_bilinear_filters[yoffset];
+
+        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                                src_pixels_per_line,
+                                                17, 16, HFilter);
+        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                                 16, 16, 16, VFilter);
+
+        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                      dst_pixels_per_line, sse);
+    }
+    return var;
 }
 #endif /* HAVE_ARMV6 */
--
cgit v1.2.3


From f0c22a3f333ad9a3e1218b8387879766bfea6d42 Mon Sep 17 00:00:00 2001
From: Johann
Date: Tue, 29 Mar 2011 10:28:30 -0400
Subject: use GLOBAL correctly on 32bit shared libraries

http://code.google.com/p/webm/issues/detail?id=309

Change-Id: I6fce9e2f74bc09a9f258df7f91ab599812324e8c
---
 vp8/encoder/x86/quantize_sse2.asm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'vp8/encoder')

diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index bc70b68a9..9a1584024 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -130,7 +130,7 @@ sym(vp8_regular_quantize_b_sse2):
     mov [rsp + zrun_zbin_boost], rsi
 
 %macro ZIGZAG_LOOP 1
-    movsx edx, WORD PTR[GLOBAL(zig_zag) + (%1 * 2)] ; rc
+    movsx edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc
     ; x
     movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
@@ -209,7 +209,7 @@ ZIGZAG_LOOP 15
     pxor xmm3, xmm6
     ; mask inv_zig_zag
     pand xmm2, [GLOBAL(inv_zig_zag)]
-    pand xmm3, [GLOBAL(inv_zig_zag) + 16]
+    pand xmm3, [GLOBAL(inv_zig_zag + 16)]
     ; select the max value
     pmaxsw xmm2, xmm3
     pshufd xmm3, xmm2, 00001110b
--
cgit v1.2.3


From b843aa4eda473577e7d22cd4045fb59df521898f Mon Sep 17 00:00:00 2001
From: Yunqing Wang
Date: Tue, 29 Mar 2011 11:31:06 -0400
Subject: Fix a crash while enabling shared (--enable-shared)

Fixed a bug in SSSE3 sub-pixel filter functions.

Change-Id: I2e2126652970eb78307ffcefcace1efd5966fb0a
---
 vp8/encoder/x86/variance_impl_ssse3.asm | 40 +++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)
(limited to 'vp8/encoder')

diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index b1976328d..3c0fef9b5 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -38,7 +38,6 @@ sym(vp8_filter_block2d_bil_var_ssse3):
     GET_GOT rbx
     push rsi
     push rdi
-    push rbx
     ; end prolog
 
     pxor xmm6, xmm6
@@ -81,10 +80,12 @@ sym(vp8_filter_block2d_bil_var_ssse3):
     packuswb xmm0, xmm2
 
-    movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
-    lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
+%if ABI_IS_32BIT
+    add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+    movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
     movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+    lea rsi, [rsi + r8]
 %endif
 
filter_block2d_bil_var_ssse3_loop:
@@ -132,10 +133,11 @@ filter_block2d_bil_var_ssse3_loop:
     paddd xmm7, xmm2
     paddd xmm7, xmm3
 
-    lea rsi, [rsi + rbx] ;ref_pixels_per_line
 %if ABI_IS_32BIT
+    add rsi, dword ptr arg(1) ;ref_pixels_per_line
     add rdi, dword ptr arg(3) ;src_pixels_per_line
 %else
+    lea rsi, [rsi + r8]
     lea rdi, [rdi + r9]
 %endif
 
@@ -161,7 +163,10 @@ filter_block2d_bil_var_ssse3_sp_only:
     movdqu xmm1, XMMWORD PTR [rsi]
     movdqa xmm0, xmm1
 
-    movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+%if ABI_IS_32BIT=0
+    movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
     lea rsi, [rsi + rax]
 
filter_block2d_bil_sp_only_loop:
@@ -196,7 +201,12 @@ filter_block2d_bil_sp_only_loop:
     movdqa xmm1, xmm0
 
     lea rsi, [rsi + rax] ;ref_pixels_per_line
-    lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+%if ABI_IS_32BIT
+    add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+    lea rdi, [rdi + r9]
+%endif
 
     sub rcx, 1
     jnz filter_block2d_bil_sp_only_loop
@@ -208,7 +218,7 @@ filter_block2d_bil_var_ssse3_full_pixel:
     mov rdi, arg(2) ;src_ptr
     movsxd rcx, dword ptr arg(4) ;Height
     movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+    movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
     pxor xmm0, xmm0
 
filter_block2d_bil_full_pixel_loop:
@@ -232,7 +242,7 @@ filter_block2d_bil_full_pixel_loop:
     paddd xmm7, xmm2
 
     lea rsi, [rsi + rax] ;ref_pixels_per_line
-    lea rdi, [rdi + rbx] ;src_pixels_per_line
+    lea rdi, [rdi + rdx] ;src_pixels_per_line
 
     sub rcx, 1
     jnz filter_block2d_bil_full_pixel_loop
@@ -245,7 +255,10 @@ filter_block2d_bil_var_ssse3_fp_only:
     movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
     pxor xmm0, xmm0
 
-    movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
 
filter_block2d_bil_fp_only_loop:
     movdqu xmm1, XMMWORD PTR [rsi]
@@ -278,7 +291,11 @@ filter_block2d_bil_fp_only_loop:
     paddd xmm7, xmm3
 
     lea rsi, [rsi + rdx]
-    lea rdi, [rdi + rbx] ;src_pixels_per_line
+%if ABI_IS_32BIT
+    add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+    lea rdi, [rdi + r9]
+%endif
 
     sub rcx, 1
     jnz filter_block2d_bil_fp_only_loop
@@ -322,7 +339,6 @@ filter_block2d_bil_variance:
     movd [rdi], xmm6
 
     ; begin epilog
-    pop rbx
    pop rdi
    pop rsi
    RESTORE_GOT
--
cgit v1.2.3
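
Note: the three ARMv6 kernels added in the first patch all follow the same plan: build a
half-pel prediction with the rounding average (a + b + 1) >> 1 (horizontal neighbour,
vertical neighbour, or both), accumulate the sum and the sum of squares of the prediction
error against the reference block, and return sse - ((sum * sum) >> 8), i.e. the variance
of the 16x16 (256-pixel) block. The C below is a minimal scalar sketch of the horizontal
case written for this note; it is not code from the patch, the name
vp8_halfpixvar16x16_h_ref is made up, and it is only assumed to mirror what
vp8_variance_halfpixvar16x16_h_armv6 computes.

    /* Scalar model (assumption, not part of the patch) of the
     * horizontal half-pel variance kernel. */
    static unsigned int vp8_halfpixvar16x16_h_ref
    (
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse
    )
    {
        int i, j, sum = 0;
        unsigned int sq = 0;

        for (i = 0; i < 16; i++)
        {
            for (j = 0; j < 16; j++)
            {
                /* half-pel prediction: rounding average of a pixel and
                 * its right-hand neighbour, as in the asm comments */
                int pred = (src_ptr[j] + src_ptr[j + 1] + 1) >> 1;
                int diff = pred - ref_ptr[j];

                sum += diff;
                sq += diff * diff;
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        *sse = sq;
        /* 256 pixels, so the mean correction divides by 256 */
        return sq - (((unsigned int)sum * sum) >> 8);
    }

The vertical kernel would substitute the pixel on the row below
(src_ptr[j + source_stride]) for the right-hand neighbour, and the hv kernel applies the
same rounding average a third time to combine the two interpolated rows, which is why its
inner loop wraps three mvn/uhsub8/eor sequences around each group of loads.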
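The mvn/uhsub8/eor triple that the new files label "bilinear interpolation" deserves a
word, since UHSUB8 is a halving subtract, not an add. Per byte, after inverting b,
(a uhsub8 ~b) is (a + b - 255) >> 1 computed in 9 bits, and XORing with 0x80 (the
c80808080 constant) adds the missing 128 back, yielding exactly the rounding average
(a + b + 1) >> 1 for four bytes at once with no overflow. A small self-contained check of
that identity, assuming UHSUB8's documented difference-then-halve lane behaviour
(uhsub8_byte is a made-up name modelling a single 8-bit lane):

    #include <assert.h>
    #include <stdio.h>

    /* One 8-bit lane of ARMv6 UHSUB8 (assumed semantics): the
     * difference is taken in 9 bits, halved, then truncated to 8 bits. */
    static unsigned char uhsub8_byte(unsigned char x, unsigned char y)
    {
        return (unsigned char)((((x - y) & 0x1ff) >> 1) & 0xff);
    }

    int main(void)
    {
        int a, b;

        for (a = 0; a < 256; a++)
        {
            for (b = 0; b < 256; b++)
            {
                /* mvn: ~b; uhsub8: halving subtract; eor 0x80: flip sign bit */
                unsigned char avg =
                    uhsub8_byte((unsigned char)a, (unsigned char)~b) ^ 0x80;

                assert(avg == ((a + b + 1) >> 1));
            }
        }

        printf("(a uhsub8 ~b) ^ 0x80 == (a + b + 1) >> 1 for all byte pairs\n");
        return 0;
    }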