author    John Koleszar <jkoleszar@google.com>  2011-03-30 00:05:07 -0400
committer John Koleszar <jkoleszar@google.com>  2011-03-30 00:05:07 -0400
commit    23afb5810e9f0682d66b7f2a734545413121e04d (patch)
tree      90741e777374a3f882151b7036fdb18e8ed94217
parent    b9f23561823eb0745d07064c7e8dcd254686e2d3 (diff)
parent    0e43668546010bf0a9ece398effb37beca5dbea5 (diff)
download  libvpx-23afb5810e9f0682d66b7f2a734545413121e04d.tar
          libvpx-23afb5810e9f0682d66b7f2a734545413121e04d.tar.gz
          libvpx-23afb5810e9f0682d66b7f2a734545413121e04d.tar.bz2
          libvpx-23afb5810e9f0682d66b7f2a734545413121e04d.zip
Merge remote branch 'origin/master' into experimental
Change-Id: Ie86a006320f3cea6a068a6b235267e19c3a19c4e
-rwxr-xr-x  configure                                                           2
-rw-r--r--  vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm                  23
-rw-r--r--  vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm    176
-rw-r--r--  vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm   216
-rw-r--r--  vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm    178
-rw-r--r--  vp8/encoder/arm/variance_arm.c                                     77
-rw-r--r--  vp8/encoder/x86/quantize_sse2.asm                                   4
-rw-r--r--  vp8/encoder/x86/variance_impl_ssse3.asm                            40
-rw-r--r--  vp8/vp8cx_arm.mk                                                    3
9 files changed, 648 insertions(+), 71 deletions(-)
diff --git a/configure b/configure
index 09cfcf05d..d50cb19b9 100755
--- a/configure
+++ b/configure
@@ -504,7 +504,7 @@ process_toolchain() {
check_add_cflags -Wpointer-arith
check_add_cflags -Wtype-limits
check_add_cflags -Wcast-qual
- enabled extra_warnings || check_add_cflags -Wno-unused
+ enabled extra_warnings || check_add_cflags -Wno-unused-function
fi
if enabled icc; then
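
The configure hunk narrows the warning suppression: -Wno-unused disables the whole -Wunused family, while -Wno-unused-function silences only warnings about unreferenced static functions, leaving unused-variable and unused-parameter diagnostics active. A minimal gcc illustration (my own example, not from the tree):

    /* With gcc -Wall:
     *   -Wno-unused           hides both diagnostics below
     *   -Wno-unused-function  hides only the function warning; the unused
     *                         variable is still reported                   */
    static int helper(void) { return 0; }  /* -Wunused-function */

    int main(void)
    {
        int scratch = 42;                   /* -Wunused-variable */
        return 0;
    }
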
diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
index 8d7258af7..988376390 100644
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,14 +25,14 @@
|vp8_variance16x16_armv6| PROC
stmfd sp!, {r4-r12, lr}
- mov r12, #16 ; set loop counter to 16 (=block height)
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
loop
; 1st 4 pixels
- ldr r4, [r0, #0x0] ; load 4 src pixels
- ldr r5, [r2, #0x0] ; load 4 ref pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
@@ -55,8 +55,8 @@ loop
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
- ldr r4, [r0, #0x4] ; load 4 src pixels
- ldr r5, [r2, #0x4] ; load 4 ref pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
@@ -79,8 +79,8 @@ loop
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
- ldr r4, [r0, #0x8] ; load 4 src pixels
- ldr r5, [r2, #0x8] ; load 4 ref pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
@@ -103,8 +103,8 @@ loop
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
- ldr r4, [r0, #0xc] ; load 4 src pixels
- ldr r5, [r2, #0xc] ; load 4 ref pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
@@ -135,13 +135,14 @@ loop
bne loop
; return stuff
- ldr r6, [sp, #0x28] ; get address of sse
+ ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
+
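
For reference, the routine computes a plain 16x16 variance; the asr #8 in the epilogue is a division by 256, the block's pixel count. A C sketch of the same arithmetic (illustrative, not the project's C code):

    /* sum and sse are accumulated exactly as the usad8/smlad pairs above
     * do; *sse gets the raw sum of squared differences (r11). */
    unsigned int variance16x16_c(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse)
    {
        int i, j, sum = 0;
        unsigned int sq = 0;

        for (i = 0; i < 16; i++, src += src_stride, ref += ref_stride)
            for (j = 0; j < 16; j++)
            {
                int d = src[j] - ref[j];       /* usub8/sel pair  */
                sum += d;                      /* usad8 step      */
                sq  += (unsigned int)(d * d);  /* smlad step      */
            }

        *sse = sq;
        /* The 64-bit product sidesteps signed overflow at the extremes;
         * the asm's 32-bit mul is safe because the true product always
         * fits in an unsigned word. */
        return sq - (unsigned int)(((long long)sum * sum) >> 8);
    }
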
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
new file mode 100644
index 000000000..2350f3e8b
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,176 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
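
The MVN/UHSUB8/EOR triple is the core trick here: it forms the per-byte rounding average (a + b + 1) >> 1 of the aligned load and the one-byte-offset load without widening to 16 bits. UHSUB8 halves a difference in each byte lane, so feeding it ~b = 255 - b yields floor((a + b - 255) / 2), and XOR-ing with 0x80808080 adds the missing 128 to every byte. A self-checking sketch of the identity (my own; assumes arithmetic right shift of negative ints, as the compilers this code targets provide):

    #include <assert.h>

    int main(void)
    {
        int a, b;
        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++)
            {
                int diff = a + b - 255;                       /* a - ~b, as UHSUB8 sees it */
                unsigned char h = (unsigned char)(diff >> 1); /* unsigned halving subtract */
                unsigned char avg = h ^ 0x80;                 /* the EOR c80808080 step    */
                assert(avg == ((a + b + 1) >> 1));            /* rounding average          */
            }
        return 0;
    }
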
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
new file mode 100644
index 000000000..f9ae3b7e2
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,216 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
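
The hv variant chains that same primitive three times per 4-pixel group, exactly as the x/y/z comments describe: horizontal averages on rows N and N+1 first, then a vertical average of the two results. Rounding once per pass matches the separable two-pass bilinear path this routine replaces. In C terms (a sketch under that reading, not project code):

    /* Stand-in for the MVN/UHSUB8/EOR result on one byte lane. */
    static unsigned char round_avg(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }

    /* z for one pixel: x and y are the horizontal half-pel averages on
     * rows N and N+1; z blends them vertically, rounding at each stage. */
    static unsigned char halfpix_hv(const unsigned char *row_n,
                                    const unsigned char *row_n1, int j)
    {
        unsigned char x = round_avg(row_n[j],  row_n[j + 1]);
        unsigned char y = round_avg(row_n1[j], row_n1[j + 1]);
        return round_avg(x, y);
    }
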
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
new file mode 100644
index 000000000..9e0a03548
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,178 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index ed1fb16d5..e77be9f73 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -57,51 +57,38 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
unsigned short first_pass[36*16];
unsigned char second_pass[20*16];
const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 17, 16, HFilter);
- vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 16, 16, 16, VFilter);
-
- return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
- dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 0,
- ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 0, 4,
- ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 4,
- ref_ptr, recon_stride, sse);
+ unsigned int var;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else
+ {
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+ }
+ return var;
}
#endif /* HAVE_ARMV6 */
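
The rewritten C wrapper now dispatches the three half-pel cases before falling back to the generic two-pass filter; xoffset and yoffset are in eighth-pel units, so 4 means exactly half a pixel. A hypothetical call site (signature as in the diff, surrounding names mine):

    unsigned int sse;
    /* Probing the horizontal half-pel position during a motion search now
     * reaches vp8_variance_halfpixvar16x16_h_armv6 directly, skipping the
     * first_pass/second_pass filter buffers entirely. */
    unsigned int var = vp8_sub_pixel_variance16x16_armv6(src_ptr, src_stride,
                                                         4 /* xoffset */,
                                                         0 /* yoffset */,
                                                         ref_ptr, ref_stride,
                                                         &sse);
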
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index bc70b68a9..9a1584024 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -130,7 +130,7 @@ sym(vp8_regular_quantize_b_sse2):
mov [rsp + zrun_zbin_boost], rsi
%macro ZIGZAG_LOOP 1
- movsx edx, WORD PTR[GLOBAL(zig_zag) + (%1 * 2)] ; rc
+ movsx edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc
; x
movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
@@ -209,7 +209,7 @@ ZIGZAG_LOOP 15
pxor xmm3, xmm6
; mask inv_zig_zag
pand xmm2, [GLOBAL(inv_zig_zag)]
- pand xmm3, [GLOBAL(inv_zig_zag) + 16]
+ pand xmm3, [GLOBAL(inv_zig_zag + 16)]
; select the max value
pmaxsw xmm2, xmm3
pshufd xmm3, xmm2, 00001110b
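
The quantize_sse2 hunks move the constant displacement inside the GLOBAL() argument. As I read it, GLOBAL() (from x86_abi_support.asm) is a textual macro that rewrites a symbol reference for the active ABI/PIC mode, so whether the offset sits inside or outside the expansion decides which subexpression any trailing relocation annotation binds to; the address is the same either way. A rough C-preprocessor analogy (pic_base and the values are stand-ins, not the real expansion):

    #include <stdio.h>

    /* Like NASM %define, the C preprocessor substitutes text, so an offset
     * passed inside the argument is grouped with the symbol before anything
     * the macro appends after it. */
    enum { pic_base = 0x1000, zig_zag = 0x40 };
    #define GLOBAL(x) x + pic_base

    int main(void)
    {
        int i = 3;
        /* Numerically identical; in the assembler the grouping decides which
         * term carries the relocation, which is why the offset moved inside. */
        printf("%#x\n", (unsigned)(GLOBAL(zig_zag) + (i * 2)));   /* zig_zag + pic_base + 6 */
        printf("%#x\n", (unsigned)(GLOBAL(zig_zag + (i * 2))));   /* zig_zag + 6 + pic_base */
        return 0;
    }
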
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index b1976328d..3c0fef9b5 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -38,7 +38,6 @@ sym(vp8_filter_block2d_bil_var_ssse3):
GET_GOT rbx
push rsi
push rdi
- push rbx
; end prolog
pxor xmm6, xmm6
@@ -81,10 +80,12 @@ sym(vp8_filter_block2d_bil_var_ssse3):
packuswb xmm0, xmm2
- movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
- lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + r8]
%endif
filter_block2d_bil_var_ssse3_loop:
@@ -132,10 +133,11 @@ filter_block2d_bil_var_ssse3_loop:
paddd xmm7, xmm2
paddd xmm7, xmm3
- lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
+ lea rsi, [rsi + r8]
lea rdi, [rdi + r9]
%endif
@@ -161,7 +163,10 @@ filter_block2d_bil_var_ssse3_sp_only:
movdqu xmm1, XMMWORD PTR [rsi]
movdqa xmm0, xmm1
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
@@ -196,7 +201,12 @@ filter_block2d_bil_sp_only_loop:
movdqa xmm1, xmm0
lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
sub rcx, 1
jnz filter_block2d_bil_sp_only_loop
@@ -208,7 +218,7 @@ filter_block2d_bil_var_ssse3_full_pixel:
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
filter_block2d_bil_full_pixel_loop:
@@ -232,7 +242,7 @@ filter_block2d_bil_full_pixel_loop:
paddd xmm7, xmm2
lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
+ lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
jnz filter_block2d_bil_full_pixel_loop
@@ -245,7 +255,10 @@ filter_block2d_bil_var_ssse3_fp_only:
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
@@ -278,7 +291,11 @@ filter_block2d_bil_fp_only_loop:
paddd xmm7, xmm3
lea rsi, [rsi + rdx]
- lea rdi, [rdi + rbx] ;src_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
sub rcx, 1
jnz filter_block2d_bil_fp_only_loop
@@ -322,7 +339,6 @@ filter_block2d_bil_variance:
movd [rdi], xmm6
; begin epilog
- pop rbx
pop rdi
pop rsi
RESTORE_GOT
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 349c3fd98..69a886baf 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -38,6 +38,9 @@ VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)