From a53d5a4c442a84cacbd8225fac72db3789b3e10c Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 15 Dec 2011 14:23:36 -0500 Subject: Moved dequant idct into common These functions are now used by the encoder. This is WIP with the goal of creating a common idct/add for the encoder and decoder. A boost of 1.8% was seen for the HD rt test clip used. [Tero] Added needed changes to ARM side. Change-Id: Ibbb8000be09034203d7adffc457d3c3f8b06a5bf --- vp8/common/arm/armv6/dequant_idct_v6.asm | 190 +++++++++++++++++++++++++++++++ vp8/common/arm/armv6/dequantize_v6.asm | 69 +++++++++++ vp8/common/arm/armv6/idct_blk_v6.c | 116 +++++++++++++++++++ 3 files changed, 375 insertions(+) create mode 100644 vp8/common/arm/armv6/dequant_idct_v6.asm create mode 100644 vp8/common/arm/armv6/dequantize_v6.asm create mode 100644 vp8/common/arm/armv6/idct_blk_v6.c (limited to 'vp8/common/arm/armv6') diff --git a/vp8/common/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm new file mode 100644 index 000000000..2510ad838 --- /dev/null +++ b/vp8/common/arm/armv6/dequant_idct_v6.asm @@ -0,0 +1,190 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_dequant_idct_add_v6| + + AREA |.text|, CODE, READONLY +;void vp8_dequant_idct_v6(short *input, short *dq, +; unsigned char *dest, int stride) +; r0 = q +; r1 = dq +; r2 = dst +; r3 = stride + +|vp8_dequant_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + mov r12, #4 + +vp8_dequant_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne vp8_dequant_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp] ; get stride from stack + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2] ; load input from dst + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2, r12] ; load input from dst + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [r2], r12 ; store output to dst + str r1, [r2], r12 ; store output to dst + bne vp8_dequant_idct_loop2_v6 + +; vpx_memset + sub r0, r0, #32 + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_idct_add_v6| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/vp8/common/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm new file mode 100644 index 000000000..72f7e0ee5 --- /dev/null +++ b/vp8/common/arm/armv6/dequantize_v6.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_dequantize_b_loop_v6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------- +;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); +; r0 short *Q, +; r1 short *DQC +; r2 short *DQ +|vp8_dequantize_b_loop_v6| PROC + stmdb sp!, {r4-r9, lr} + + ldr r3, [r0] ;load Q + ldr r4, [r1] ;load DQC + ldr r5, [r0, #4] + ldr r6, [r1, #4] + + mov r12, #2 ;loop counter + +dequant_loop + smulbb r7, r3, r4 ;multiply + smultt r8, r3, r4 + smulbb r9, r5, r6 + smultt lr, r5, r6 + + ldr r3, [r0, #8] + ldr r4, [r1, #8] + ldr r5, [r0, #12] + ldr r6, [r1, #12] + + strh r7, [r2], #2 ;store result + smulbb r7, r3, r4 ;multiply + strh r8, [r2], #2 + smultt r8, r3, r4 + strh r9, [r2], #2 + smulbb r9, r5, r6 + strh lr, [r2], #2 + smultt lr, r5, r6 + + subs r12, r12, #1 + + add r0, r0, #16 + add r1, r1, #16 + + ldrne r3, [r0] + strh r7, [r2], #2 ;store result + ldrne r4, [r1] + strh r8, [r2], #2 + ldrne r5, [r0, #4] + strh r9, [r2], #2 + ldrne r6, [r1, #4] + strh lr, [r2], #2 + + bne dequant_loop + + ldmia sp!, {r4-r9, pc} + ENDP ;|vp8_dequantize_b_loop_v6| + + END diff --git a/vp8/common/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c new file mode 100644 index 000000000..9108929f5 --- /dev/null +++ b/vp8/common/arm/armv6/idct_blk_v6.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8/common/idct.h" +#include "vp8/common/dequantize.h" + + +void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dst, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) + { + vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) + { + vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dstu, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dstv, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + dstv += 4*stride; + eobs += 2; + } +} -- cgit v1.2.3