author     Johann <johannkoenig@google.com>           2011-02-10 06:06:46 -0800
committer  Code Review <code-review@webmproject.org>  2011-02-10 06:06:46 -0800
commit     7d8199f0c3422e6bd63bef4b9992ce0b2e6b74b5 (patch)
tree       59c51f00a9827ad729509ce2a0edd17b6805f8d5 /vp8
parent     fffa2a61d72883019c7c09466db7e947cb7de34d (diff)
parent     cb14764fab88b5b28ba09fa9490bd72c017cb7c2 (diff)
Merge "Adds armv6 optimized variance calculation"
Diffstat (limited to 'vp8')
-rw-r--r--  vp8/common/arm/bilinearfilter_arm.c                 21
-rw-r--r--  vp8/common/arm/bilinearfilter_arm.h                 35
-rw-r--r--  vp8/encoder/arm/arm_csystemdependent.c               8
-rw-r--r--  vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm  147
-rw-r--r--  vp8/encoder/arm/variance_arm.c                      34
-rw-r--r--  vp8/encoder/arm/variance_arm.h                      17
-rw-r--r--  vp8/vp8_common.mk                                    1
-rw-r--r--  vp8/vp8cx_arm.mk                                     4
8 files changed, 242 insertions, 25 deletions
diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
index 961d142c9..6a46ef685 100644
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -12,26 +12,7 @@
#include <math.h>
#include "filter.h"
#include "subpixel.h"
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
- unsigned char *src_ptr,
- unsigned short *dst_ptr,
- unsigned int src_pitch,
- unsigned int height,
- unsigned int width,
- const short *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
- unsigned short *src_ptr,
- unsigned char *dst_ptr,
- int dst_pitch,
- unsigned int height,
- unsigned int width,
- const short *vp8_filter
-);
+#include "arm/bilinearfilter_arm.h"
void vp8_filter_block2d_bil_armv6
(
diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h
new file mode 100644
index 000000000..b7155d3f0
--- /dev/null
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+ const unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+ const unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
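The new header exposes the two ARMv6 bilinear-filter passes to C callers: the first pass filters horizontally, reading bytes and writing 16-bit intermediates, and the second pass filters vertically, reading those intermediates and writing bytes again. As a rough scalar illustration of the first pass (not part of this change; it assumes VP8's two-tap bilinear filter table, whose coefficients sum to 128 and are applied with a round-and-shift by 7):

/* Illustrative scalar equivalent of the horizontal (first) bilinear pass.
 * Not part of this commit; assumes VP8's two-tap bilinear filters, where
 * each pair of taps sums to 128 (VP8_FILTER_WEIGHT) and results are
 * rounded and shifted right by 7 (VP8_FILTER_SHIFT). */
static void bil_first_pass_sketch(const unsigned char *src_ptr,
                                  unsigned short *dst_ptr,
                                  unsigned int src_pitch,
                                  unsigned int height,
                                  unsigned int width,
                                  const short *vp8_filter)
{
    unsigned int i, j;

    for (i = 0; i < height; i++)
    {
        for (j = 0; j < width; j++)
        {
            /* weight two horizontally adjacent pixels, round, scale down */
            dst_ptr[j] = (unsigned short)((src_ptr[j]     * vp8_filter[0] +
                                           src_ptr[j + 1] * vp8_filter[1] +
                                           64) >> 7);
        }

        src_ptr += src_pitch;   /* advance to the next input row        */
        dst_ptr += width;       /* intermediate rows are packed tightly */
    }
}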
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index a1f110260..6c17a7984 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
- cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
- cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;*/
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;*/
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_armv6;
/*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
- cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
- cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;*/
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_armv6;
/*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
new file mode 100644
index 000000000..8d7258af7
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,147 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0x0] ; load 4 src pixels
+ ldr r5, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #0x4] ; load 4 src pixels
+ ldr r5, [r2, #0x4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #0x8] ; load 4 src pixels
+ ldr r5, [r2, #0x8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #0xc] ; load 4 src pixels
+ ldr r5, [r2, #0xc] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #0x28] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
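The routine walks the 16x16 block four pixels at a time: usub8/sel split the byte differences into positive and negative parts, usad8 accumulates their signed sum, and uxtb16/smlad accumulate the sum of squared differences; the final value returned is sse - (sum*sum >> 8), i.e. the usual variance with the mean-correction term divided by the block's 256 pixels. A plain C sketch of the same computation, for reference only (the committed entry point is the assembly above):

/* Reference-only C sketch of the value computed by vp8_variance16x16_armv6;
 * not part of this commit. */
static unsigned int variance16x16_sketch(const unsigned char *src_ptr,
                                         int source_stride,
                                         const unsigned char *ref_ptr,
                                         int recon_stride,
                                         unsigned int *sse)
{
    int i, j;
    int sum = 0;                /* signed sum of pixel differences  */
    unsigned int sse_acc = 0;   /* sum of squared pixel differences */

    for (i = 0; i < 16; i++)
    {
        for (j = 0; j < 16; j++)
        {
            int diff = src_ptr[j] - ref_ptr[j];
            sum += diff;
            sse_acc += (unsigned int)(diff * diff);
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = sse_acc;
    /* variance = SSE - sum^2 / N, with N = 16 * 16 = 256 */
    return sse_acc - (unsigned int)((sum * sum) >> 8);
}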
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index b40c0482f..9737bef46 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -10,6 +10,40 @@
#include "vpx_config.h"
#include "variance.h"
+#include "filter.h"
+#include "arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned short first_pass[36*16];
+ unsigned char second_pass[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+}
+
+#endif
#if HAVE_ARMV7
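The sub-pixel wrapper added here filters the source block with the two ARMv6 bilinear passes (17 rows horizontally into a 16-bit intermediate buffer, then 16 rows vertically back to bytes) and hands the filtered 16x16 block to the new variance routine. A hypothetical usage sketch, where the src/ref pointers and strides stand in for encoder-provided data:

/* Hypothetical caller of the new sub-pixel variance entry point; src/ref
 * and the strides are placeholders for encoder-provided data. */
static unsigned int subpel_variance_example(const unsigned char *src,
                                            int src_stride,
                                            const unsigned char *ref,
                                            int ref_stride)
{
    unsigned int sse;

    /* xoffset/yoffset index the eight-entry vp8_bilinear_filters table (0..7) */
    return vp8_sub_pixel_variance16x16_armv6(src, src_stride,
                                             4, 2,
                                             ref, ref_stride,
                                             &sse);
}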
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 3cbacfac3..06d72873e 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -12,6 +12,23 @@
#ifndef VARIANCE_ARM_H
#define VARIANCE_ARM_H
+#if HAVE_ARMV6
+
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
#if HAVE_ARMV7
extern prototype_sad(vp8_sad4x4_neon);
extern prototype_sad(vp8_sad8x8_neon);
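With runtime CPU detection disabled, the #undef/#define pairs above rebind the generic variance hooks directly to the ARMv6 symbols at compile time. The pattern, shown here with placeholder names rather than the actual libvpx macros, is that a common header supplies a default binding and a platform header then overrides it:

/* Placeholder illustration of the compile-time override pattern; the
 * names below are generic stand-ins, not the real libvpx macros. */
#define my_variance_hook my_variance_hook_c      /* default from the common header */

#if HAVE_ARMV6 && !CONFIG_RUNTIME_CPU_DETECT
#undef  my_variance_hook
#define my_variance_hook my_variance_hook_armv6  /* platform header rebinds it */
#endif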
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 25909456a..af07618a1 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -116,6 +116,7 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
# common (c)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.h
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/reconintra_arm.c
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index b23ac96ca..abc5dc800 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -17,9 +17,10 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/variance_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h
VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c
@@ -33,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon