diff options
author | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
commit | 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch) | |
tree | 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/common | |
download | libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2 libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip |
Initial WebM release
Diffstat (limited to 'vp8/common')
168 files changed, 37458 insertions, 0 deletions
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c new file mode 100644 index 000000000..ac110f761 --- /dev/null +++ b/vp8/common/alloccommon.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "onyxc_int.h" +#include "findnearmv.h" +#include "entropymode.h" +#include "systemdependent.h" +#include "vpxerrors.h" + +#ifdef HAVE_CONFIG_H +#include "vpx_config.h" +#endif + +extern void vp8_init_scan_order_mask(); + +void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols) +{ + int i; + vpx_memset(mi - cols - 1, 0, sizeof(MODE_INFO) * cols + 1); + + for (i = 0; i < rows; i++) + { + vpx_memset(&mi[i*cols-1], 0, sizeof(MODE_INFO)); + } +} +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci) +{ + vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); + vp8_yv12_de_alloc_frame_buffer(&oci->new_frame); + vp8_yv12_de_alloc_frame_buffer(&oci->last_frame); + vp8_yv12_de_alloc_frame_buffer(&oci->golden_frame); + vp8_yv12_de_alloc_frame_buffer(&oci->alt_ref_frame); + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); + + vpx_free(oci->above_context[Y1CONTEXT]); + vpx_free(oci->above_context[UCONTEXT]); + vpx_free(oci->above_context[VCONTEXT]); + vpx_free(oci->above_context[Y2CONTEXT]); + vpx_free(oci->mip); + + oci->above_context[Y1CONTEXT] = 0; + oci->above_context[UCONTEXT] = 0; + oci->above_context[VCONTEXT] = 0; + oci->above_context[Y2CONTEXT] = 0; + oci->mip = 0; + + // Structure used to minitor GF useage + if (oci->gf_active_flags != 0) + vpx_free(oci->gf_active_flags); + + oci->gf_active_flags = 0; +} + +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) +{ + 
vp8_de_alloc_frame_buffers(oci); + + // our internal buffers are always multiples of 16 + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) + height += 16 - (height & 0xf); + + + if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + + if (vp8_yv12_alloc_frame_buffer(&oci->new_frame, width, height, VP8BORDERINPIXELS) < 0) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + if (vp8_yv12_alloc_frame_buffer(&oci->last_frame, width, height, VP8BORDERINPIXELS) < 0) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + if (vp8_yv12_alloc_frame_buffer(&oci->golden_frame, width, height, VP8BORDERINPIXELS) < 0) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + if (vp8_yv12_alloc_frame_buffer(&oci->alt_ref_frame, width, height, VP8BORDERINPIXELS) < 0) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + oci->mb_rows = height >> 4; + oci->mb_cols = width >> 4; + oci->MBs = oci->mb_rows * oci->mb_cols; + oci->mode_info_stride = oci->mb_cols + 1; + oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->mip) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + oci->mi = oci->mip + oci->mode_info_stride + 1; + + + oci->above_context[Y1CONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 4 , 1); + + if (!oci->above_context[Y1CONTEXT]) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + oci->above_context[UCONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 2 , 1); + + if (!oci->above_context[UCONTEXT]) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + oci->above_context[VCONTEXT] = 
vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 2 , 1); + + if (!oci->above_context[VCONTEXT]) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + oci->above_context[Y2CONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols , 1); + + if (!oci->above_context[Y2CONTEXT]) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols); + + // Structures used to minitor GF usage + if (oci->gf_active_flags != 0) + vpx_free(oci->gf_active_flags); + + oci->gf_active_flags = (unsigned char *)vpx_calloc(oci->mb_rows * oci->mb_cols, 1); + + if (!oci->gf_active_flags) + { + vp8_de_alloc_frame_buffers(oci); + return ALLOC_FAILURE; + } + + oci->gf_active_count = oci->mb_rows * oci->mb_cols; + + return 0; +} +void vp8_setup_version(VP8_COMMON *cm) +{ + switch (cm->version) + { + case 0: + cm->no_lpf = 0; + cm->simpler_lpf = 0; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + case 1: + cm->no_lpf = 0; + cm->simpler_lpf = 1; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 2: + cm->no_lpf = 1; + cm->simpler_lpf = 0; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 3: + cm->no_lpf = 1; + cm->simpler_lpf = 1; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 1; + break; + default: + //4,5,6,7 are reserved for future use + cm->no_lpf = 0; + cm->simpler_lpf = 0; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + } +} +void vp8_create_common(VP8_COMMON *oci) +{ + vp8_machine_specific_config(oci); + vp8_default_coef_probs(oci); + vp8_init_mbmode_probs(oci); + vp8_default_bmode_probs(oci->fc.bmode_prob); + + oci->mb_no_coeff_skip = 1; + oci->no_lpf = 0; + oci->simpler_lpf = 0; + oci->use_bilinear_mc_filter = 0; + oci->full_pixel = 0; + oci->multi_token_partition = ONE_PARTITION; + oci->clr_type = REG_YUV; + oci->clamp_type = RECON_CLAMP_REQUIRED; + + // Initialise reference frame sign bias structure to 
defaults + vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + + // Default disable buffer to buffer copying + oci->copy_buffer_to_gf = 0; + oci->copy_buffer_to_arf = 0; +} + +void vp8_remove_common(VP8_COMMON *oci) +{ + vp8_de_alloc_frame_buffers(oci); +} + +void vp8_initialize_common() +{ + vp8_coef_tree_initialize(); + + vp8_entropy_mode_init(); + + vp8_init_scan_order_mask(); + +} diff --git a/vp8/common/alloccommon.h b/vp8/common/alloccommon.h new file mode 100644 index 000000000..73c7383c7 --- /dev/null +++ b/vp8/common/alloccommon.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_ALLOCCOMMON_H +#define __INC_ALLOCCOMMON_H + +#include "onyxc_int.h" + +void vp8_create_common(VP8_COMMON *oci); +void vp8_remove_common(VP8_COMMON *oci); +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); +void vp8_setup_version(VP8_COMMON *oci); + +#endif diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm new file mode 100644 index 000000000..4428cf8ff --- /dev/null +++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_filter_block2d_bil_first_pass_armv6| + EXPORT |vp8_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *output_ptr, +; r2 unsigned int src_pixels_per_line, +; r3 unsigned int output_height, +; stack unsigned int output_width, +; stack const short *vp8_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|vp8_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; output width + + mov r12, r3 ; outer-loop counter + sub r2, r2, r4 ; src increment for height loop + + ;;IF ARCHITECTURE=6 + pld [r0] + ;;ENDIF + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; output_height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save output_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat 
r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + ;;IF ARCHITECTURE=6 + pld [r0] + ;;ENDIF + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 int output_pitch, +; r3 unsigned int output_height, +; stack unsigned int output_width, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; output width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + 
smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_armv6| + + END diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm new file mode 100644 index 000000000..00e97397c --- /dev/null +++ b/vp8/common/arm/armv6/copymem16x16_v6.asm @@ -0,0 +1,181 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem16x16_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_v6| PROC + stmdb sp!, {r4 - r7} + ;push {r4-r7} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #15 + beq copy_mem16x16_fast + + ands r4, r0, #7 + beq copy_mem16x16_8 + + ands r4, r0, #3 + beq copy_mem16x16_4 + + ;copy one byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + ldrb r6, [r0, #2] + ldrb r7, [r0, #3] + + mov r12, #16 + +copy_mem16x16_1_loop + strb r4, [r2] + strb r5, [r2, #1] + strb r6, [r2, #2] + strb r7, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + ldrb r6, [r0, #6] + ldrb r7, [r0, #7] + + subs r12, r12, #1 + + strb r4, [r2, #4] + strb r5, [r2, #5] + strb r6, [r2, #6] + strb r7, [r2, #7] + + ldrb r4, [r0, #8] + ldrb r5, [r0, #9] + ldrb r6, [r0, #10] + ldrb r7, [r0, #11] + + strb r4, [r2, #8] + strb r5, [r2, #9] + strb r6, [r2, #10] + strb r7, [r2, #11] + + ldrb r4, [r0, #12] + ldrb r5, [r0, #13] + ldrb r6, [r0, #14] + ldrb r7, [r0, #15] + + add r0, r0, r1 + + strb r4, [r2, #12] + strb r5, [r2, #13] + strb r6, [r2, #14] + strb r7, [r2, #15] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + ldrneb r6, [r0, #2] + ldrneb r7, [r0, #3] + + bne copy_mem16x16_1_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 4 bytes each time +copy_mem16x16_4 + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + mov r12, #16 + +copy_mem16x16_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + str r6, [r2, #8] + str r7, [r2, #12] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + ldrne r6, [r0, #8] + ldrne r7, [r0, #12] + + bne copy_mem16x16_4_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 8 bytes each 
time +copy_mem16x16_8 + sub r1, r1, #16 + sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_8_loop + ldmia r0!, {r4-r5} + ;ldm r0, {r4-r5} + ldmia r0!, {r6-r7} + + add r0, r0, r1 + + stmia r2!, {r4-r5} + subs r12, r12, #1 + ;stm r2, {r4-r5} + stmia r2!, {r6-r7} + + add r2, r2, r3 + + bne copy_mem16x16_8_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 16 bytes each time +copy_mem16x16_fast + ;sub r1, r1, #16 + ;sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_fast_loop + ldmia r0, {r4-r7} + ;ldm r0, {r4-r7} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r7} + ;stm r2, {r4-r7} + add r2, r2, r3 + + bne copy_mem16x16_fast_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm new file mode 100644 index 000000000..94473ca65 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x4_v6.asm @@ -0,0 +1,127 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x4_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x4_fast + + ands r4, r0, #3 + beq copy_mem8x4_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #4 + +copy_mem8x4_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x4_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x4_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #4 + +copy_mem8x4_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x4_4_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x4_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #4 + +copy_mem8x4_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x4_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm new file mode 100644 index 000000000..7cfa53389 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x8_v6.asm @@ -0,0 
+1,127 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem8x8_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x8_fast + + ands r4, r0, #3 + beq copy_mem8x8_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #8 + +copy_mem8x8_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x8_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x8_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #8 + +copy_mem8x8_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x8_4_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x8_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #8 + +copy_mem8x8_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, 
r2, r3 + + bne copy_mem8x8_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_v6| + + END diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm new file mode 100644 index 000000000..a7863fc94 --- /dev/null +++ b/vp8/common/arm/armv6/filter_v6.asm @@ -0,0 +1,383 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_first_pass_armv6| + EXPORT |vp8_filter_block2d_second_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_only_armv6| + EXPORT |vp8_filter_block2d_second_pass_only_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr +; r1 short *output_ptr +; r2 unsigned int src_pixels_per_line +; r3 unsigned int output_width +; stack unsigned int output_height +; stack const short *vp8_filter +;------------------------------------- +; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with +; the output being a 2 byte value and the intput being a 1 byte value. 
+|vp8_filter_block2d_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ;;IF ARCHITECTURE=6 + ;pld [r0, #-2] + ;;pld [r0, #30] + ;;ENDIF + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. 
+ ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_6 + + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [r0, r2] + ;;pld [r0, r9] + ;;ENDIF + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + sub sp, sp, #4 + mov r7, r3, lsl #16 ; height is top part of counter + str r1, [sp] ; push destination to stack + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + + sub r0, r0, #4 ; offset input buffer + +|height_loop_2nd| + ldr r8, [r0] ; load the data + ldr r9, [r0, #4] + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd| + smuad lr, r4, r8 ; apply filter + sub r7, r7, #1 + smulbt r8, r4, r8 + + ldr r10, [r0, #8] + + smlad lr, r5, r9, lr + smladx r8, r12, r9, r8 + + ldrh r9, [r0, #12] + + smlad lr, r6, r10, lr + smladx r8, r11, r10, r8 + + add r0, r0, #4 + smlatb r10, r6, r9, r8 + + add lr, lr, #0x40 ; round_shift_and_clamp + ands r8, r7, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r2 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + ldrne r8, [r0] ; load data for next loop + ldrne r9, [r0, #4] + strb r10, [r1], r2 + + bne width_loop_2nd + + ldr r1, [sp] ; update dst for next loop + subs r7, r7, #0x10000 + add r0, r0, #16 ; updata src for next loop + add r1, r1, #1 + str r1, [sp] + + bne height_loop_2nd + + add 
sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;------------------------------------ +; r0 unsigned char *src_ptr +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_first_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r4, [sp, #36] ; output pitch + ldr r11, [sp, #40] ; HFilter address + sub sp, sp, #8 + + mov r7, r3 + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + sub r4, r4, r3 + str r4, [sp] ; save modified output pitch + str r2, [sp, #4] + + mov r2, #0x40 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + +; six tap filter +|height_loop_1st_only_6| + ldrb r8, [r0, #-2] ; load data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + + mov r12, r3, lsr #1 ; loop counter + +|width_loop_1st_only_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + +;; smuad lr, lr, r4 + smlad lr, lr, r4, r2 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 +;; smuad r8, r8, r4 + smlad r8, r8, r4, r2 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + subs r12, r12, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + +;; add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 +;; add r10, r10, #0x40 + strb lr, [r1], #1 ; store the result + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0, #-1] + strb r10, [r1], #1 + ldrneb r10, [r0], #2 + + bne width_loop_1st_only_6 + + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [r0, r2] + ;;pld [r0, r9] + 
;;ENDIF + + ldr lr, [sp] ; load back output pitch + ldr r12, [sp, #4] ; load back output pitch + subs r7, r7, #1 + add r0, r0, r12 ; updata src for next loop + add r1, r1, lr ; update dst for next loop + + bne height_loop_1st_only_6 + + add sp, sp, #8 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_first_pass_only_armv6| + + +;------------------------------------ +; r0 unsigned char *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_second_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; VFilter address + ldr r12, [sp, #36] ; output pitch + + mov r7, r3, lsl #16 ; height is top part of counter + sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after + + sub sp, sp, #8 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r0, [sp] ; save r0 to stack + str r1, [sp, #4] ; save dst to stack + +; six tap filter +|width_loop_2nd_only_6| + ldrb r8, [r0], r2 ; load data + orr r7, r7, r3 ; loop counter + ldrb r9, [r0], r2 + ldrb r10, [r0], r2 + +|height_loop_2nd_only_6| + ; filter first column in this inner loop, than, move to next colum. 
+ ldrb r11, [r0], r2 + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0], r2 + + smuad lr, lr, r4 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0], r2 + smlad r8, r11, r5, r8 + ldrb r11, [r0] + + sub r7, r7, #2 + sub r0, r0, r2, lsl #2 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + + ands r9, r7, #0xff + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0], r2 ; load data for next loop + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r12 ; store the result for the column + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0], r2 + strb r10, [r1], r12 + ldrneb r10, [r0], r2 + + bne height_loop_2nd_only_6 + + ldr r0, [sp] + ldr r1, [sp, #4] + subs r7, r7, #0x10000 + add r0, r0, #1 ; move to filter next column + str r0, [sp] + add r1, r1, #1 + str r1, [sp, #4] + + bne width_loop_2nd_only_6 + + add sp, sp, #8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_only_armv6| + + END diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm new file mode 100644 index 000000000..25c5165ec --- /dev/null +++ b/vp8/common/arm/armv6/idct_v6.asm @@ -0,0 +1,376 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
;

; idct_v6.asm — ARMv6 implementations of the VP8 4x4 inverse DCT ("llm" variant)
; plus a DC-only shortcut. All routines take:
;   r0 = INT16 *input  (4x4 block of coefficients, row-major, 2 bytes/element)
;   r1 = INT16 *output
;   r2 = INT32 pitch   (output row stride in bytes)
; register usage: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
    EXPORT  |vp8_short_idct4x4llm_1_v6|
    EXPORT  |vp8_short_idct4x4llm_v6|
    EXPORT  |vp8_short_idct4x4llm_v6_scott|
    EXPORT  |vp8_short_idct4x4llm_v6_dual|

    EXPORT  |vp8_dc_only_idct_armv6|

    AREA    |.text|, CODE, READONLY

;********************************************************************************
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0   INT16 * input
;* r1   INT16 * output
;* r2   INT32 pitch
;* bench: 3/5
;* DC-only inverse transform: every output sample is (input[0] + 4) >> 3.
;********************************************************************************

|vp8_short_idct4x4llm_1_v6| PROC             ; cycles  in  out  pit
    ldrsh   r0, [r0]                ; load input[0] (signed 16-bit DC coeff)
    add     r0, r0, #4              ; rounding bias
    stmdb   sp!, {r4, r5, lr}       ; make room for wide (strd) writes
    mov     r0, r0, asr #3          ; (input[0] + 4) >> 3
    pkhbt   r4, r0, r0, lsl #16     ; pack the value into both halves of r4
    mov     r5, r4                  ; duplicate into r5 so strd stores 4 identical halfwords
    strd    r4, [r1], r2            ; *output row 0 = a1 a1 a1 a1, post-inc by pitch
    strd    r4, [r1], r2            ; row 1
    strd    r4, [r1], r2            ; row 2
    strd    r4, [r1]                ; row 3
    ldmia   sp!, {r4, r5, pc}       ; restore and return
    ENDP    ; |vp8_short_idct4x4llm_1_v6|
;********************************************************************************
;********************************************************************************
;********************************************************************************

;********************************************************************************
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0   INT16 * input
;* r1   INT16 * output
;* r2   INT32 pitch
;* bench:
;* Full 4x4 inverse transform, one column (pass 1) / one row (pass 2) at a time.
;* Pass 2 re-reads the pass-1 results from the output buffer and applies the
;* (x + 4) >> 3 final rounding.
;********************************************************************************

|vp8_short_idct4x4llm_v6| PROC
    stmdb   sp!, {r4-r11, lr}       ; backup registers

    mov     r4, #0x00004E00         ; build constant in two immediates
    orr     r4, r4, #0x0000007B     ; r4 = cospi8sqrt2minus1 (0x4E7B)
    mov     r5, #0x00008A00
    orr     r5, r5, #0x0000008C     ; r5 = sinpi8sqrt2 (0x8A8C)

    mov     r6, #4                  ; i = 4 (column counter)
loop1                               ; vertical pass: one column per iteration
    ldrsh   r12, [r0, #8]           ; input[4]
    ldrsh   r3, [r0, #24]           ; input[12]
    ldrsh   r8, [r0, #16]           ; input[8]
    ldrsh   r7, [r0], #0x2          ; input[0], advance to next column
    smulwb  r10, r5, r12            ; ([4] * sinpi8sqrt2) >> 16
    smulwb  r11, r4, r3             ; ([12] * cospi8sqrt2minus1) >> 16
    add     r9, r7, r8              ; a1 = [0] + [8]
    sub     r7, r7, r8              ; b1 = [0] - [8]
    add     r11, r3, r11            ; temp2
    rsb     r11, r11, r10           ; c1 = temp1 - temp2
    smulwb  r3, r5, r3              ; ([12] * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r12            ; ([4] * cospi8sqrt2minus1) >> 16
    add     r8, r7, r11             ; b1 + c1
    strh    r8, [r1, r2]            ; out[pitch] = b1+c1
    sub     r7, r7, r11             ; b1 - c1
    add     r10, r12, r10           ; temp1
    add     r3, r10, r3             ; d1 = temp1 + temp2
    add     r10, r9, r3             ; a1 + d1
    sub     r3, r9, r3              ; a1 - d1
    add     r8, r2, r2              ; pitch * 2
    strh    r7, [r1, r8]            ; out[pitch*2] = b1-c1
    add     r7, r2, r2, lsl #1      ; pitch * 3
    strh    r3, [r1, r7]            ; out[pitch*3] = a1-d1
    subs    r6, r6, #1              ; i--
    strh    r10, [r1], #0x2         ; out[0] = a1+d1, advance out
    bne     loop1                   ; if i > 0, continue

    sub     r1, r1, #8              ; rewind out for pass 2
                                    ; for this pass, input = previous output
    mov     r6, #4                  ; i = 4 (row counter)
; b returnfull
loop2                               ; horizontal pass: one row per iteration
    ldrsh   r11, [r1, #2]           ; input[1]
    ldrsh   r8, [r1, #6]            ; input[3]
    ldrsh   r3, [r1, #4]            ; input[2]
    ldrsh   r0, [r1]                ; input[0]
    smulwb  r9, r5, r11             ; ([1] * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r8             ; ([3] * cospi8sqrt2minus1) >> 16
    add     r7, r0, r3              ; a1 = [0] + [2]
    sub     r0, r0, r3              ; b1 = [0] - [2]
    add     r10, r8, r10            ; temp2
    rsb     r9, r10, r9             ; c1 = temp1 - temp2
    smulwb  r8, r5, r8              ; ([3] * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r11            ; ([1] * cospi8sqrt2minus1) >> 16
    add     r3, r0, r9              ; b1+c1
    add     r3, r3, #4              ; b1+c1+4 (final rounding bias)
    add     r10, r11, r10           ; temp1
    mov     r3, r3, asr #3          ; (b1+c1+4) >> 3
    strh    r3, [r1, #2]            ; out[1] = b1+c1
    add     r10, r10, r8            ; d1 = temp1 + temp2
    add     r3, r7, r10             ; a1+d1
    add     r3, r3, #4              ; a1+d1+4
    sub     r7, r7, r10             ; a1-d1
    add     r7, r7, #4              ; a1-d1+4
    mov     r3, r3, asr #3          ; (a1+d1+4) >> 3
    mov     r7, r7, asr #3          ; (a1-d1+4) >> 3
    strh    r7, [r1, #6]            ; out[3] = a1-d1
    sub     r0, r0, r9              ; b1-c1
    add     r0, r0, #4              ; b1-c1+4
    subs    r6, r6, #1              ; i--
    mov     r0, r0, asr #3          ; (b1-c1+4) >> 3
    strh    r0, [r1, #4]            ; out[2] = b1-c1
    strh    r3, [r1], r2            ; out[0] = a1+d1, advance out by pitch
; add r1, r1, r2                    ; out += pitch (folded into the strh above)
    bne     loop2                   ; if i > 0, continue
returnfull
    ldmia   sp!, {r4 - r11, pc}     ; restore and return
    ENDP

;********************************************************************************
;********************************************************************************
;********************************************************************************

;********************************************************************************
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
;* r0   INT16 * input
;* r1   INT16 * output
;* r2   INT32 pitch
;* bench:
;* NOTE(review): experimental SIMD (uadd16/usub16 pair-wise) variant. The
;* second pass loop below is an empty countdown — this routine looks
;* unfinished and presumably is not the production path; confirm before use.
;********************************************************************************

|vp8_short_idct4x4llm_v6_scott| PROC
;   mov     r0, #0
;   ldr     r0, [r0]
    stmdb   sp!, {r4 - r11, lr}     ; backup registers

    mov     r3, #0x00004E00         ; cos
    orr     r3, r3, #0x0000007B     ; cospi8sqrt2minus1
    mov     r4, #0x00008A00         ; sin
    orr     r4, r4, #0x0000008C     ; sinpi8sqrt2

    mov     r5, #0x2                ; i = 2 (two column pairs)

short_idct4x4llm_v6_scott_loop1
    ldr     r10, [r0, #(4*2)]       ; i5 | i4
    ldr     r11, [r0, #(12*2)]      ; i13 | i12

    smulwb  r6, r4, r10             ; ((ip[4] * sinpi8sqrt2) >> 16)
    smulwb  r7, r3, r11             ; ((ip[12] * cospi8sqrt2minus1) >> 16)

    smulwb  r12, r3, r10            ; ((ip[4] * cospi8sqrt2minus1) >> 16)
    smulwb  r14, r4, r11            ; ((ip[12] * sinpi8sqrt2) >> 16)

    add     r6, r6, r7              ; partial c1
    add     r12, r12, r14           ; partial d1

    smulwt  r14, r4, r10            ; ((ip[5] * sinpi8sqrt2) >> 16)
    smulwt  r7, r3, r11             ; ((ip[13] * cospi8sqrt2minus1) >> 16)

    smulwt  r8, r3, r10             ; ((ip[5] * cospi8sqrt2minus1) >> 16)
    smulwt  r9, r4, r11             ; ((ip[13] * sinpi8sqrt2) >> 16)

    add     r7, r14, r7             ; partial c1_2
    sub     r8, r8, r9              ; partial d1_2

    pkhbt   r6, r6, r7, lsl #16     ; partial c1_2 | partial c1_1
    pkhbt   r12, r12, r8, lsl #16   ; partial d1_2 | partial d1_1

    usub16  r6, r6, r10             ; c1_2 | c1_1
    uadd16  r12, r12, r11           ; d1_2 | d1_1

    ldr     r10, [r0, #0]           ; i1 | i0
    ldr     r11, [r0, #(8*2)]       ; i9 | i8

;;;;;;  add     r0, r0, #0x4
;;;;;;  add     r1, r1, #0x4

    uadd16  r8, r10, r11            ; i1 + i9 | i0 + i8  aka a1
    usub16  r9, r10, r11            ; i1 - i9 | i0 - i8  aka b1

    uadd16  r7, r8, r12             ; a1 + d1 pair
    usub16  r14, r8, r12            ; a1 - d1 pair

    str     r7, [r1]                ; op[0] = a1 + d1
    str     r14, [r1, r2]           ; op[pitch*3] = a1 - d1

    add     r0, r0, #0x4            ; advance input pair
    add     r1, r1, #0x4            ; advance output pair

    subs    r5, r5, #0x1            ; i--
    bne     short_idct4x4llm_v6_scott_loop1

    sub     r1, r1, #16             ; reset output ptr
    mov     r5, #0x4
    mov     r0, r1                  ; input = output

short_idct4x4llm_v6_scott_loop2
    ; NOTE(review): loop body is empty — second pass never implemented
    subs    r5, r5, #0x1
    bne     short_idct4x4llm_v6_scott_loop2

    ldmia   sp!, {r4 - r11, pc}
    ENDP

;********************************************************************************
;********************************************************************************
;********************************************************************************

;********************************************************************************
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
;* r0   INT16 * input
;* r1   INT16 * output
;* r2   INT32 pitch
;* bench:
;* SIMD variant: pass 1 processes two columns per iteration with packed
;* 16-bit adds; pass 2 processes two rows per iteration, rounding with a
;* packed +4|+4 bias and >> 3, writing results back in place over the output.
;********************************************************************************

|vp8_short_idct4x4llm_v6_dual| PROC
    stmdb   sp!, {r4-r11, lr}       ; backup registers
    mov     r3, #0x00004E00         ; cos
    orr     r3, r3, #0x0000007B     ; cospi8sqrt2minus1
    mov     r4, #0x00008A00         ; sin
    orr     r4, r4, #0x0000008C     ; sinpi8sqrt2
    mov     r5, #0x2                ; i = 2
loop1_dual
    ldr     r6, [r0, #(4*2)]        ; i5 | i4
    ldr     r12, [r0, #(12*2)]      ; i13 | i12
    ldr     r14, [r0, #(8*2)]       ; i9 | i8

    smulwt  r9, r3, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulwb  r7, r3, r6              ; (ip[4] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
    smulwb  r8, r4, r6              ; (ip[4] * sinpi8sqrt2) >> 16
    pkhbt   r7, r7, r9, lsl #16     ; 5c | 4c
    smulwt  r11, r3, r12            ; (ip[13] * cospi8sqrt2minus1) >> 16
    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
    uadd16  r6, r6, r7              ; 5c+5 | 4c+4
    smulwt  r7, r4, r12             ; (ip[13] * sinpi8sqrt2) >> 16
    smulwb  r9, r3, r12             ; (ip[12] * cospi8sqrt2minus1) >> 16
    smulwb  r10, r4, r12            ; (ip[12] * sinpi8sqrt2) >> 16
    subs    r5, r5, #0x1            ; i--
    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
    ldr     r11, [r0], #0x4         ; i1 | i0, advance input
    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
    uadd16  r7, r12, r9             ; 13c+13 | 12c+12
    usub16  r7, r8, r7              ; c
    uadd16  r6, r6, r10             ; d
    uadd16  r10, r11, r14           ; a
    usub16  r8, r11, r14            ; b
    uadd16  r9, r10, r6             ; a+d
    usub16  r10, r10, r6            ; a-d
    uadd16  r6, r8, r7              ; b+c
    usub16  r7, r8, r7              ; b-c
    str     r6, [r1, r2]            ; o5 | o4
    add     r6, r2, r2              ; pitch * 2
    str     r7, [r1, r6]            ; o9 | o8
    add     r6, r6, r2              ; pitch * 3
    str     r10, [r1, r6]           ; o13 | o12
    str     r9, [r1], #0x4          ; o1 | o0, advance output
    bne     loop1_dual

    mov     r5, #0x2                ; i = 2
    sub     r0, r1, #8              ; reset: pass-2 input = pass-1 output
loop2_dual
    ldr     r6, [r0, r2]            ; i5 | i4
    ldr     r1, [r0]                ; i1 | i0
    ldr     r12, [r0, #0x4]         ; i3 | i2
    add     r14, r2, #0x4           ; pitch + 4 bytes (2 shorts)
    ldr     r14, [r0, r14]          ; i7 | i6
    smulwt  r9, r3, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulwt  r7, r3, r1              ; (ip[1] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
    smulwt  r8, r4, r1              ; (ip[1] * sinpi8sqrt2) >> 16
    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4
    pkhbt   r7, r9, r7, lsl #16     ; 1c | 5c
    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)
    pkhtb   r1, r1, r6, asr #16     ; i1 | i5
    uadd16  r1, r7, r1              ; 1c+1 | 5c+5 = temp2 (d)
    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
    uadd16  r10, r11, r9            ; a
    usub16  r9, r11, r9             ; b
    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
    subs    r5, r5, #0x1            ; i--
    smulwt  r7, r3, r6              ; (ip[3] * cospi8sqrt2minus1) >> 16
    smulwt  r11, r4, r6             ; (ip[3] * sinpi8sqrt2) >> 16
    smulwb  r12, r3, r6             ; (ip[7] * cospi8sqrt2minus1) >> 16
    smulwb  r14, r4, r6             ; (ip[7] * sinpi8sqrt2) >> 16

    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)
    uadd16  r6, r7, r6              ; 3c+3 | 7c+7 = temp2 (c)
    usub16  r12, r8, r6             ; c (o1 | o5)
    uadd16  r6, r11, r1             ; d (o3 | o7)
    uadd16  r7, r10, r6             ; a+d
    mov     r8, #0x4                ; set up packed rounding bias
    orr     r8, r8, #0x40000        ; 4 | 4
    usub16  r6, r10, r6             ; a-d
    uadd16  r6, r6, r8              ; a-d+4    (3|7)
    uadd16  r7, r7, r8              ; a+d+4    (0|4)
    uadd16  r10, r9, r12            ; b+c
    usub16  r1, r9, r12             ; b-c
    uadd16  r10, r10, r8            ; b+c+4    (1|5)
    uadd16  r1, r1, r8              ; b-c+4    (2|6)
    ; high halfwords: shift the packed value down 19 = (>>16 to select) + (>>3 to round)
    mov     r8, r10, asr #19        ; o1 >> 3
    strh    r8, [r0, #2]            ; o1
    mov     r8, r1, asr #19         ; o2 >> 3
    strh    r8, [r0, #4]            ; o2
    mov     r8, r6, asr #19         ; o3 >> 3
    strh    r8, [r0, #6]            ; o3
    mov     r8, r7, asr #19         ; o0 >> 3
    strh    r8, [r0], r2            ; o0, advance to next row
    ; low halfwords: sign-extend then >> 3
    sxth    r10, r10
    mov     r8, r10, asr #3         ; o5 >> 3
    strh    r8, [r0, #2]            ; o5
    sxth    r1, r1
    mov     r8, r1, asr #3          ; o6 >> 3
    strh    r8, [r0, #4]            ; o6
    sxth    r6, r6
    mov     r8, r6, asr #3          ; o7 >> 3
    strh    r8, [r0, #6]            ; o7
    sxth    r7, r7
    mov     r8, r7, asr #3          ; o4 >> 3
    strh    r8, [r0], r2            ; o4, advance to next row
;;;;;   subs    r5, r5, #0x1        ; i-- (done above, before the muls)
    bne     loop2_dual

    ldmia   sp!, {r4 - r11, pc}     ; restore and return
    ENDP


; sjl added 10/17/08
;void dc_only_idct_armv6(short input_dc, short *output, int pitch)
; Writes the rounded DC value ((input_dc + 4) >> 3) to all 16 output samples.
|vp8_dc_only_idct_armv6| PROC
    stmdb   sp!, {r4 - r6, lr}

    add     r0, r0, #0x4            ; rounding bias
    add     r4, r1, r2              ; output + pitch
    mov     r0, r0, ASR #0x3        ; a1 = (input_dc + 4) >> 3
    add     r5, r1, r2, LSL #1      ; output + pitch * 2
    pkhbt   r0, r0, r0, lsl #16     ; a1 | a1
    add     r6, r5, r2              ; output + pitch * 3

    str     r0, [r1, #0]            ; row 0
    str     r0, [r1, #4]

    str     r0, [r4, #0]            ; row 1
    str     r0, [r4, #4]

    str     r0, [r5, #0]            ; row 2
    str     r0, [r5, #4]

    str     r0, [r6, #0]            ; row 3
    str     r0, [r6, #4]


    ldmia   sp!, {r4 - r6, pc}

    ENDP    ; |vp8_dc_only_idct_armv6|

    END
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm
new file mode 100644
index 000000000..87475681f
--- /dev/null
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -0,0 +1,151 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;

; iwalsh_v6.asm — ARMv6 inverse 4x4 Walsh-Hadamard transform for the VP8
; second-order (Y2) block. Both routines take:
;   r0 = short *input   (16 coefficients)
;   r1 = short *output  (written contiguously, 4 bytes at a time)

    EXPORT |vp8_short_inv_walsh4x4_armv6|
    EXPORT |vp8_short_inv_walsh4x4_1_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY  ; name this block of code

;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
; Full inverse WHT: vertical butterflies on packed halfword pairs, then
; horizontal butterflies, +3 rounding bias and >> 3 on every result.
|vp8_short_inv_walsh4x4_armv6| PROC

    stmdb       sp!, {r4 - r11, lr}

    ldr         r2, [r0], #4         ; [1  |  0]
    ldr         r3, [r0], #4         ; [3  |  2]
    ldr         r4, [r0], #4         ; [5  |  4]
    ldr         r5, [r0], #4         ; [7  |  6]
    ldr         r6, [r0], #4         ; [9  |  8]
    ldr         r7, [r0], #4         ; [11 | 10]
    ldr         r8, [r0], #4         ; [13 | 12]
    ldr         r9, [r0]             ; [15 | 14]

    ; vertical pass, columns 0 and 1 (saturating packed adds)
    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]

    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]

    ; vertical pass, columns 2 and 3
    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]

    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]

    ; first transform complete

    ; horizontal pass, rows 0 and 1 (cross add/sub on packed pairs)
    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |  0+3]
    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |  0-3]
    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |  4+7]
    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |  4-7]

    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
    ldr         r10, c0x00030003     ; packed +3 rounding bias
    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]

    qadd16      r2, r2, r10          ; [b2+3|c2+3]
    qadd16      r3, r3, r10          ; [a2+3|d2+3]
    qadd16      r4, r4, r10          ; [b2+3|c2+3]
    qadd16      r5, r5, r10          ; [a2+3|d2+3]

    ; repack rows 0/1 in output order while shifting each halfword >> 3
    asr         r12, r2, #3          ; [1  |  x]
    pkhtb       r12, r12, r3, asr #19 ; [1  |  0]
    lsl         lr, r3, #16          ; [~3 |  x]
    lsl         r2, r2, #16          ; [~2 |  x]
    asr         lr, lr, #3           ; [3  |  x]
    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]

    asr         r2, r4, #3           ; [5  |  x]
    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]
    lsl         r3, r5, #16          ; [~7 |  x]
    lsl         r4, r4, #16          ; [~6 |  x]
    asr         r3, r3, #3           ; [7  |  x]
    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]

    str         r12, [r1], #4
    str         lr, [r1], #4
    str         r2, [r1], #4
    str         r3, [r1], #4

    ; horizontal pass, rows 2 and 3
    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]

    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]

    qadd16      r6, r6, r10          ; [b2+3|c2+3]
    qadd16      r7, r7, r10          ; [a2+3|d2+3]
    qadd16      r8, r8, r10          ; [b2+3|c2+3]
    qadd16      r9, r9, r10          ; [a2+3|d2+3]

    asr         r2, r6, #3           ; [9  |  x]
    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]
    lsl         r3, r7, #16          ; [~11|  x]
    lsl         r4, r6, #16          ; [~10|  x]
    asr         r3, r3, #3           ; [11 |  x]
    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]

    asr         r4, r8, #3           ; [13 |  x]
    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]
    lsl         r5, r9, #16          ; [~15|  x]
    lsl         r6, r8, #16          ; [~14|  x]
    asr         r5, r5, #3           ; [15 |  x]
    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]

    str         r2, [r1], #4
    str         r3, [r1], #4
    str         r4, [r1], #4
    str         r5, [r1]

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_short_inv_walsh4x4_armv6|


;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
; DC-only inverse WHT: replicate ((input[0] + 3) >> 3) into all 16 outputs.
|vp8_short_inv_walsh4x4_1_armv6| PROC

    ldrsh       r2, [r0]             ; [0]
    add         r2, r2, #3           ; [0] + 3
    asr         r2, r2, #3           ; a1 = ([0]+3) >> 3
    lsl         r2, r2, #16          ; [a1 |  x]
    orr         r2, r2, r2, lsr #16  ; [a1 | a1]

    str         r2, [r1], #4
    str         r2, [r1], #4
    str         r2, [r1], #4
    str         r2, [r1], #4
    str         r2, [r1], #4
    str         r2, [r1], #4
    str         r2, [r1], #4
    str         r2, [r1]

    bx          lr
    ENDP        ; |vp8_short_inv_walsh4x4_1_armv6|

; Constant Pool
c0x00030003 DCD 0x00030003
    END
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm
new file mode 100644
index 000000000..c2b02dc0a
--- /dev/null
+++
b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -0,0 +1,1263 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_armv6| + EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_vertical_edge_armv6| + EXPORT |vp8_mbloop_filter_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 +count RN r5 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *flimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. 
+ +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldr r4, [r2], #4 ; flimit + ldr r10, [src], pstep ; p2 + ldr r2, [r3], #4 ; limit + ldr r11, [src], pstep ; p1 + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r6], #4 ; thresh + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|Hnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + orr lr, lr, r10 + sub src, src, pstep, lsl #2 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq hskip_filter ; skip filtering + + sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines + + ;vp8_hevmask() function + ;calculate high edge variance + orr r10, r6, r8 ; calculate vp8_hevmask + + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 ; use usub8 instead of ssub8 + sel r6, r12, r11 ; obtain vp8_hevmask: r6 + + ;vp8_filter() function + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + + 
eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + and r7, r7, lr ; vp8_filter &= mask; + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: Filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + + ;end of modification for vp8 + + mov lr, #0 + sadd8 r7, r7 , r10 ; vp8_filter += 1 + shadd8 r7, r7, lr ; vp8_filter >>= 1 + + ldr r11, [sp, #12] ; load ps1 + ldr r10, [sp, #8] ; load qs1 + + bic r7, r7, r6 ; vp8_filter &= ~hev + sub src, src, pstep, lsl #2 + + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - 
vp8_filter) + + eor r11, r11, r12 ; *op1 = u^0x80 + str r11, [src], pstep ; store op1 + eor r9, r9, r12 ; *op0 = u^0x80 + str r9, [src], pstep ; store op0 result + eor r8, r8, r12 ; *oq0 = u^0x80 + str r8, [src], pstep ; store oq0 result + eor r10, r10, r12 ; *oq1 = u^0x80 + str r10, [src], pstep ; store oq1 + + sub src, src, pstep, lsl #1 + +|hskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #2 + + subs count, count, #1 + + ;pld [src] + ;pld [src, pstep] + ;pld [src, pstep, lsl #1] + ;pld [src, pstep, lsl #2] + ;pld [src, pstep, lsl #3] + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne Hnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldr r4, [r2], #4 ; flimit + ldr r10, [src], pstep ; p2 + ldr r2, [r3], #4 ; limit + ldr r11, [src], pstep ; p1 + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r6], #4 ; thresh + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|MBHnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + + orr lr, lr, r10 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbhskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines + sub src, src, pstep, lsl #1 + + orr r10, r6, r8 + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_mbfilter() function + ;p2, q2 are only needed at the end. Don't need to load them in now. 
+ ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src] ; q1 + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + 
;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + str r8, [src] ; store *oq0 + sub src, src, pstep + eor r10, r10, lr ; *op0 = s^0x80 + str r10, [src] ; store *op0 + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + eor r11, r11, lr ; *op1 = s^0x80 + str r11, [src], pstep ; store *op1 + eor r8, r8, lr ; *oq1 = s^0x80 + add src, src, pstep, lsl #1 + + mov r7, #0x3f ; 63 + + str r8, [src], 
pstep ; store *oq1 + + ;roughly 1/7th difference across boundary + mov lr, #0x9 ; 9 + ldr r9, [src] ; load q2 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + sub src, src, pstep + ldr lr, c0x80808080 + + ldr r11, [src] ; load p2 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + eor r9, r9, lr + eor r11, r11, lr + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + str r8, [src], pstep, lsl #2 ; store *op2 + add src, src, pstep + eor r10, r10, lr ; *oq2 = s^0x80 + str r10, [src], pstep, lsl #1 ; store *oq2 + +|mbhskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #3 + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne MBHnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldr r4, [r2], #4 ; flimit + ldr r7, [src], pstep + ldr r2, [r3], #4 ; limit + ldr r8, [src], pstep + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r12], #4 ; thresh + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; 
flimit * 2 + limit + +|Vnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs 
(p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq vskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_filter() function + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + str r6, [sp] + str lr, [sp, #4] + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to r7, r8, r9, r10 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + ;end 
of modification for vp8 + + eor r8, r8, r12 + eor r9, r9, r12 + + mov lr, #0 + + sadd8 r7, r7, r10 + shadd8 r7, r7, lr + + ldr r10, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + bic r7, r7, r6 ; r7: vp8_filter + + qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + eor r10, r10, r12 + eor r11, r11, r12 + + sub src, src, pstep, lsl #2 + + ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 + ;output is b0, b1, b2, b3 + ;b0: 03 02 01 00 + ;b1: 13 12 11 10 + ;b2: 23 22 21 20 + ;b3: 33 32 31 30 + ; p1 p0 q0 q1 + ; (a3 a2 a1 a0) + TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr + + strh r6, [src, #-2] ; store the result + mov r6, r6, lsr #16 + strh r6, [src], pstep + + strh r7, [src, #-2] + mov r7, r7, lsr #16 + strh r7, [src], pstep + + strh r12, [src, #-2] + mov r12, r12, lsr #16 + strh r12, [src], pstep + + strh lr, [src, #-2] + mov lr, lr, lsr #16 + strh lr, [src], pstep + +|vskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne Vnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_vertical_edge_armv6| + + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldr r4, [r2], #4 ; flimit + ldr r7, [src], pstep + ldr r2, [r3], #4 ; limit + ldr r8, [src], pstep + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r12], #4 ; thresh + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + 
uadd8 r4, r4, r2 ; flimit * 2 + limit + +|MBVnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, 
r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbvskip_filter ; skip filtering + + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + + ; vp8_mbfilter() function + ; p2, q2 are only needed at the end. Don't need to load them in now. + ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + str r6, [sp] ; save r6 + str lr, [sp, #4] ; save lr + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to p1, p0, q0, q1 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + 
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + 
ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + eor r10, r10, lr ; *op0 = s^0x80 + + strb r10, [src, #-1] ; store op0 result + strb r8, [src], pstep ; store oq0 result + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + ldr lr, c0x80808080 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + add src, src, #2 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + eor r8, r8, lr ; *oq1 = s^0x80 + eor r10, r10, lr ; *op1 = s^0x80 + + ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary + strb r10, [src, #-4] ; store op1 + strb r8, [src, #-1] ; store oq1 + ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + + ldrb r6, [src, #-5] + 
strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + ;roughly 1/7th difference across boundary + eor r9, r9, lr + eor r11, r11, lr + + mov lr, #0x9 ; 9 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + ldr lr, c0x80808080 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + eor r10, r10, lr ; *oq2 = s^0x80 + + strb r8, [src, #-5] ; store *op2 + strb r10, [src], pstep ; store *oq2 + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + + ;adjust src pointer for next loop + sub src, src, #2 + +|mbvskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne MBVnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; 
|vp8_mbloop_filter_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 +c0x01010101 DCD 0x01010101 +c0x7F7F7F7F DCD 0x7F7F7F7F + + END diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm new file mode 100644 index 000000000..085ff80c9 --- /dev/null +++ b/vp8/common/arm/armv6/recon_v6.asm @@ -0,0 +1,280 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_armv6| + EXPORT |vp8_recon2b_armv6| + EXPORT |vp8_recon4b_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +prd RN r0 +dif RN r1 +dst RN r2 +stride RN r3 + +;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) +; R0 char* pred_ptr +; R1 short * dif_ptr +; R2 char * dst_ptr +; R3 int stride + +; Description: +; Loop through the block adding the Pred and Diff together. Clamp and then +; store back into the Dst. + +; Restrictions : +; all buffers are expected to be 4 byte aligned coming in and +; going out. 
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; Reconstruct one 4x4 block: dst[i] = clamp8(pred[i] + diff[i]).
; Fully unrolled over the 4 rows.  prd advances by 16 per row and dif by 32
; (16 shorts) per row -- assumes the prediction/diff buffers use a 16-element
; row pitch (TODO confirm against callers).
|vp8_recon_b_armv6| PROC
    stmdb   sp!, {r4 - r9, lr}

    ;0, 1, 2, 3
    ; r4 holds 4 packed pred bytes; r6/r7 hold 4 signed 16-bit diffs
    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
    ldr     r6, [dif, #0]           ; 1 | 0
    ldr     r7, [dif, #4]           ; 3 | 2

    ; interleave diffs into even/odd halfword pairs
    pkhbt   r8, r6, r7, lsl #16     ; 2 | 0
    pkhtb   r9, r7, r6, asr #16     ; 3 | 1

    ; add zero-extended pred bytes to the 16-bit diffs
    uxtab16 r8, r8, r4              ; 2 | 0 + 3 | 2 | 2 | 0
    uxtab16 r9, r9, r4, ror #8      ; 3 | 1 + 0 | 3 | 2 | 1

    ; saturate each halfword to [0, 255] and repack to 4 bytes
    usat16  r8, #8, r8
    usat16  r9, #8, r9
    add     dif, dif, #32
    orr     r8, r8, r9, lsl #8

    str     r8, [dst], stride

    ;0, 1, 2, 3
    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
;;  ldr     r6, [dif, #8]           ; 1 | 0
;;  ldr     r7, [dif, #12]          ; 3 | 2
    ldr     r6, [dif, #0]           ; 1 | 0
    ldr     r7, [dif, #4]           ; 3 | 2

    pkhbt   r8, r6, r7, lsl #16     ; 2 | 0
    pkhtb   r9, r7, r6, asr #16     ; 3 | 1

    uxtab16 r8, r8, r4              ; 2 | 0 + 3 | 2 | 2 | 0
    uxtab16 r9, r9, r4, ror #8      ; 3 | 1 + 0 | 3 | 2 | 1

    usat16  r8, #8, r8
    usat16  r9, #8, r9
    add     dif, dif, #32
    orr     r8, r8, r9, lsl #8

    str     r8, [dst], stride

    ;0, 1, 2, 3
    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
;;  ldr     r6, [dif, #16]          ; 1 | 0
;;  ldr     r7, [dif, #20]          ; 3 | 2
    ldr     r6, [dif, #0]           ; 1 | 0
    ldr     r7, [dif, #4]           ; 3 | 2

    pkhbt   r8, r6, r7, lsl #16     ; 2 | 0
    pkhtb   r9, r7, r6, asr #16     ; 3 | 1

    uxtab16 r8, r8, r4              ; 2 | 0 + 3 | 2 | 2 | 0
    uxtab16 r9, r9, r4, ror #8      ; 3 | 1 + 0 | 3 | 2 | 1

    usat16  r8, #8, r8
    usat16  r9, #8, r9
    add     dif, dif, #32
    orr     r8, r8, r9, lsl #8

    str     r8, [dst], stride

    ;0, 1, 2, 3 -- last row: no diff-pointer advance needed
    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
;;  ldr     r6, [dif, #24]          ; 1 | 0
;;  ldr     r7, [dif, #28]          ; 3 | 2
    ldr     r6, [dif, #0]           ; 1 | 0
    ldr     r7, [dif, #4]           ; 3 | 2

    pkhbt   r8, r6, r7, lsl #16     ; 2 | 0
    pkhtb   r9, r7, r6, asr #16     ; 3 | 1

    uxtab16 r8, r8, r4              ; 2 | 0 + 3 | 2 | 2 | 0
    uxtab16 r9, r9, r4, ror #8      ; 3 | 1 + 0 | 3 | 2 | 1

    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst], stride

    ldmia   sp!, {r4 - r9, pc}

    ENDP    ; |recon_b|

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; Reconstruct a 16x4 region (four 4x4 blocks side by side), one row of 16
; pixels per loop iteration: dst[i] = clamp8(pred[i] + diff[i]).
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon4b_armv6| PROC
    stmdb   sp!, {r4 - r9, lr}

    mov     lr, #4                  ; 4 rows

recon4b_loop
    ;0, 1, 2, 3
    ldr     r4, [prd], #4           ; 3 | 2 | 1 | 0
    ldr     r6, [dif, #0]           ; 1 | 0
    ldr     r7, [dif, #4]           ; 3 | 2

    pkhbt   r8, r6, r7, lsl #16     ; 2 | 0
    pkhtb   r9, r7, r6, asr #16     ; 3 | 1

    uxtab16 r8, r8, r4              ; 2 | 0 + 3 | 2 | 2 | 0
    uxtab16 r9, r9, r4, ror #8      ; 3 | 1 + 0 | 3 | 2 | 1

    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst]

    ;4, 5, 6, 7
    ldr     r4, [prd], #4
;;  ldr     r6, [dif, #32]
;;  ldr     r7, [dif, #36]
    ldr     r6, [dif, #8]
    ldr     r7, [dif, #12]

    pkhbt   r8, r6, r7, lsl #16
    pkhtb   r9, r7, r6, asr #16

    uxtab16 r8, r8, r4
    uxtab16 r9, r9, r4, ror #8
    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst, #4]

    ;8, 9, 10, 11
    ldr     r4, [prd], #4
;;  ldr     r6, [dif, #64]
;;  ldr     r7, [dif, #68]
    ldr     r6, [dif, #16]
    ldr     r7, [dif, #20]

    pkhbt   r8, r6, r7, lsl #16
    pkhtb   r9, r7, r6, asr #16

    uxtab16 r8, r8, r4
    uxtab16 r9, r9, r4, ror #8
    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst, #8]

    ;12, 13, 14, 15
    ldr     r4, [prd], #4
;;  ldr     r6, [dif, #96]
;;  ldr     r7, [dif, #100]
    ldr     r6, [dif, #24]
    ldr     r7, [dif, #28]

    pkhbt   r8, r6, r7, lsl #16
    pkhtb   r9, r7, r6, asr #16

    uxtab16 r8, r8, r4
    uxtab16 r9, r9, r4, ror #8
    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst, #12]

    add     dst, dst, stride
;;  add     dif, dif, #8
    add     dif, dif, #32           ; advance by one row of 16 shorts

    subs    lr, lr, #1
    bne     recon4b_loop

    ldmia   sp!, {r4 - r9, pc}

    ENDP    ; |Recon4B|

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; Reconstruct an 8x4 region (two 4x4 blocks side by side), one row of 8
; pixels per loop iteration: dst[i] = clamp8(pred[i] + diff[i]).
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon2b_armv6| PROC
    stmdb   sp!, {r4 - r9, lr}

    mov     lr, #4                  ; 4 rows

recon2b_loop
    ;0, 1, 2, 3
    ; r4: 4 packed pred bytes; r6/r7: 4 signed 16-bit diffs
    ldr     r4, [prd], #4
    ldr     r6, [dif, #0]
    ldr     r7, [dif, #4]

    ; interleave diffs, add zero-extended pred, saturate to bytes, repack
    pkhbt   r8, r6, r7, lsl #16
    pkhtb   r9, r7, r6, asr #16

    uxtab16 r8, r8, r4
    uxtab16 r9, r9, r4, ror #8
    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst]

    ;4, 5, 6, 7
    ldr     r4, [prd], #4
;;  ldr     r6, [dif, #32]
;;  ldr     r7, [dif, #36]
    ldr     r6, [dif, #8]
    ldr     r7, [dif, #12]

    pkhbt   r8, r6, r7, lsl #16
    pkhtb   r9, r7, r6, asr #16

    uxtab16 r8, r8, r4
    uxtab16 r9, r9, r4, ror #8
    usat16  r8, #8, r8
    usat16  r9, #8, r9
    orr     r8, r8, r9, lsl #8

    str     r8, [dst, #4]

    add     dst, dst, stride
;;  add     dif, dif, #8
    add     dif, dif, #16           ; advance by one row of 8 shorts

    subs    lr, lr, #1
    bne     recon2b_loop

    ldmia   sp!, {r4 - r9, pc}

    ENDP    ; |Recon2B|

    END
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
new file mode 100644
index 000000000..15c6c7d16
--- /dev/null
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -0,0 +1,321 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;

    EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|

    AREA |.text|, CODE, READONLY        ; name this block of code

    ; 4x4 byte matrix transpose using the ARMv6 packing instructions.
    ; Inputs may be clobbered ($a0-$a3 are rotated in place as scratch).
    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    ; gather the even-numbered source columns
    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    ; gather the odd-numbered source columns
    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    ; merge halves into the transposed output rows
    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND


src         RN  r0
pstep       RN  r1

;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *flimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count

;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
; Simple (4-tap) horizontal-edge loop filter, 4 pixels per iteration.
; Registers r3-r6 hold p1/p0/q0/q1 for 4 adjacent columns; the mask and the
; filter are computed with the ARMv6 SIMD byte ops.
; FIX: "vp8_filter &= mask" previously ANDed with lr, which holds -1 at that
; point (a no-op), so unmasked pixels were filtered whenever any pixel in the
; group passed the threshold test.  The mask produced by sel lives in r10.
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #1     ; move src pointer down by 2 lines

    ldr         r12, [r3], #4               ; limit
    ldr         r3, [src], pstep            ; p1

    ldr         r9, [sp, #36]               ; count for 8-in-parallel
    ldr         r4, [src], pstep            ; p0

    ldr         r7, [r2], #4                ; flimit
    ldr         r5, [src], pstep            ; q0
    ldr         r2, c0x80808080

    ldr         r6, [src]                   ; q1

    uadd8       r7, r7, r7                  ; flimit * 2
    mov         r9, r9, lsl #1              ; 4-in-parallel
    uadd8       r12, r7, r12                ; flimit * 2 + limit

|simple_hnext8|
    ; vp8_simple_filter_mask() function

    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r10, r4, r5                 ; p0 - q0
    uqsub8      r11, r5, r4                 ; q0 - p0
    orr         r8, r8, r7                  ; abs(p1 - q1)
    ldr         lr, c0x7F7F7F7F             ; 01111111 mask
    orr         r10, r10, r11               ; abs(p0 - q0)
    and         r8, lr, r8, lsr #1          ; abs(p1 - q1) / 2
    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
    mvn         lr, #0                      ; lr == -1
    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    ; STALL waiting on r10 :(
    uqsub8      r10, r10, r12               ; compare to flimit
    mov         r8, #0

    usub8       r10, r8, r10                ; use usub8 instead of ssub8
    ; STALL (maybe?) when are flags set? :/
    sel         r10, lr, r8                 ; filter mask: r10 (0xFF pass / 0x00 fail per byte)

    cmp         r10, #0
    beq         simple_hskip_filter         ; skip filtering

    ;vp8_simple_filter() function

    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value

    qsub8       r3, r3, r6                  ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
    qsub8       r6, r5, r4                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))

    qadd8       r3, r3, r6
    ldr         r8, c0x03030303             ; r8 = 3

    qadd8       r3, r3, r6
    ldr         r7, c0x04040404

    qadd8       r3, r3, r6
    and         r3, r3, r10                 ; vp8_filter &= mask (FIX: mask is in r10, not lr)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8 , r3 , r8                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r3 , r3 , r7                ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)

    mov         r7, #0
    shadd8      r8 , r8 , r7                ; Filter2 >>= 3
    shadd8      r3 , r3 , r7                ; Filter1 >>= 3
    shadd8      r8 , r8 , r7
    shadd8      r3 , r3 , r7
    shadd8      r8 , r8 , r7                ; r8: Filter2
    shadd8      r3 , r3 , r7                ; r3: Filter1

    ;calculate output
    sub         src, src, pstep, lsl #1

    qadd8       r4, r4, r8                  ; u = vp8_signed_char_clamp(p0 + Filter2)
    qsub8       r5 ,r5, r3                  ; u = vp8_signed_char_clamp(q0 - Filter1)
    eor         r4, r4, r2                  ; *op0 = u^0x80
    str         r4, [src], pstep            ; store op0 result
    eor         r5, r5, r2                  ; *oq0 = u^0x80
    str         r5, [src], pstep            ; store oq0 result

|simple_hskip_filter|
    add         src, src, #4
    sub         src, src, pstep
    sub         src, src, pstep, lsl #1

    subs        r9, r9, #1

    ;pld [src]
    ;pld [src, pstep]
    ;pld [src, pstep, lsl #1]

    ldrne       r3, [src], pstep            ; p1
    ldrne       r4, [src], pstep            ; p0
    ldrne       r5, [src], pstep            ; q0
    ldrne       r6, [src]                   ; q1

    bne         simple_hnext8

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
; Simple (4-tap) vertical-edge loop filter.  4 rows are loaded as halfword
; pairs around the edge, transposed to p1/p0/q0/q1 vectors, filtered, then
; the two changed columns are transposed back via byte stores.
; FIXES:
;  * the filter mask produced by sel was written to r7, but the code tests
;    and applies the mask from lr (see "filter mask: lr" / "vp8_filter &=
;    mask"); r7 is also clobbered by the c0x04040404 load before use.  The
;    sel destination is now lr, matching every consumer.
;  * the epilogue popped {r4 - r12, pc} against a {r4 - r11, lr} prologue
;    push (one register too many), corrupting the returned state.
|vp8_loop_filter_simple_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    ldr         r12, [r2], #4               ; r12: flimit
    ldr         r2, c0x80808080
    ldr         r7, [r3], #4                ; limit

    ; load soure data to r7, r8, r9, r10
    ldrh        r3, [src, #-2]
    ldrh        r4, [src], pstep
    uadd8       r12, r12, r12               ; flimit * 2

    ldrh        r5, [src, #-2]
    ldrh        r6, [src], pstep
    uadd8       r12, r12, r7                ; flimit * 2 + limit

    pkhbt       r7, r3, r4, lsl #16

    ldrh        r3, [src, #-2]
    ldrh        r4, [src], pstep
    ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel

    pkhbt       r8, r5, r6, lsl #16

    ldrh        r5, [src, #-2]
    ldrh        r6, [src], pstep
    mov         r11, r11, lsl #1            ; 4-in-parallel

|simple_vnext8|
    ; vp8_simple_filter_mask() function
    pkhbt       r9, r3, r4, lsl #16
    pkhbt       r10, r5, r6, lsl #16

    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6

    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r9, r4, r5                  ; p0 - q0
    uqsub8      r10, r5, r4                 ; q0 - p0
    orr         r7, r7, r8                  ; abs(p1 - q1)
    orr         r9, r9, r10                 ; abs(p0 - q0)
    ldr         lr, c0x7F7F7F7F             ; 0111 1111 mask
    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    and         r7, lr, r7, lsr #1          ; abs(p1 - q1) / 2
    mov         r8, #0
    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r10, #0                     ; r10 == -1
    uqsub8      r7, r7, r12                 ; compare to flimit

    usub8       r7, r8, r7
    sel         lr, r10, r8                 ; filter mask: lr (FIX: was sel r7, but lr is consumed below)

    cmp         lr, #0
    beq         simple_vskip_filter         ; skip filtering

    ;vp8_simple_filter() function
    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value

    qsub8       r3, r3, r6                  ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
    qsub8       r6, r5, r4                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))

    qadd8       r3, r3, r6
    ldr         r8, c0x03030303             ; r8 = 3

    qadd8       r3, r3, r6
    ldr         r7, c0x04040404             ; note: clobbers r7 -- mask must live in lr

    qadd8       r3, r3, r6
    and         r3, r3, lr                  ; vp8_filter &= mask

    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8 , r3 , r8                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r3 , r3 , r7                ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)

    mov         r7, #0
    shadd8      r8 , r8 , r7                ; Filter2 >>= 3
    shadd8      r3 , r3 , r7                ; Filter1 >>= 3
    shadd8      r8 , r8 , r7
    shadd8      r3 , r3 , r7
    shadd8      r8 , r8 , r7                ; r8: filter2
    shadd8      r3 , r3 , r7                ; r3: filter1

    ;calculate output
    sub         src, src, pstep, lsl #2

    qadd8       r4, r4, r8                  ; u = vp8_signed_char_clamp(p0 + Filter2)
    qsub8       r5, r5, r3                  ; u = vp8_signed_char_clamp(q0 - Filter1)
    eor         r4, r4, r2                  ; *op0 = u^0x80
    eor         r5, r5, r2                  ; *oq0 = u^0x80

    ; scatter the 4 filtered p0/q0 bytes back down the column
    strb        r4, [src, #-1]              ; store the result
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]
    strb        r5, [src], pstep

|simple_vskip_filter|
    subs        r11, r11, #1

    ;pld [src]
    ;pld [src, pstep]
    ;pld [src, pstep, lsl #1]

    ; load soure data to r7, r8, r9, r10
    ldrneh      r3, [src, #-2]
    ldrneh      r4, [src], pstep

    ldrneh      r5, [src, #-2]
    ldrneh      r6, [src], pstep

    pkhbt       r7, r3, r4, lsl #16

    ldrneh      r3, [src, #-2]
    ldrneh      r4, [src], pstep

    pkhbt       r8, r5, r6, lsl #16

    ldrneh      r5, [src, #-2]
    ldrneh      r6, [src], pstep

    bne         simple_vnext8

    ldmia       sp!, {r4 - r11, pc}         ; FIX: was {r4 - r12, pc}, unbalanced vs prologue
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080
c0x03030303 DCD     0x03030303
c0x04040404 DCD     0x04040404
c0x01010101 DCD     0x01010101
c0x7F7F7F7F DCD     0x7F7F7F7F

    END
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..551d863e9
--- /dev/null
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,277 @@
;
; Copyright (c) 2010 The VP8 project authors.
All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict8x4_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pitch +;------------------------------------- +;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. +;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, +;and the result is stored in transpose. +|vp8_sixtap_predict8x4_armv6| PROC + stmdb sp!, {r4 - r11, lr} + sub sp, sp, #184 ;reserve space on stack for temporary storage: 20x(8+1) +4 + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + str r3, [sp], #4 ;store yoffset + beq skip_firstpass_filter + +;first-pass filter + ldr r12, _filter8_coeff_ + sub r0, r0, r1, lsl #1 + + add r2, r12, r2, lsl #4 ;calculate filter location + add r0, r0, #3 ;adjust src only for loading convinience + + ldr r3, [r2] ; load up packed filter coefficients + ldr r4, [r2, #4] + ldr r5, [r2, #8] + + mov r2, #0x90000 ; height=9 is top part of counter + + sub r1, r1, #8 + mov lr, #20 + +|first_pass_hloop_v6| + ldrb r6, [r0, #-5] ; load source data + ldrb r7, [r0, #-4] + ldrb r8, [r0, #-3] + ldrb r9, [r0, #-2] + ldrb r10, [r0, #-1] + + orr r2, r2, #0x4 ; construct loop counter. 
width=8=4x2 + + pkhbt r6, r6, r7, lsl #16 ; r7 | r6 + pkhbt r7, r7, r8, lsl #16 ; r8 | r7 + + pkhbt r8, r8, r9, lsl #16 ; r9 | r8 + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + +|first_pass_wloop_v6| + smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1] + smuad r12, r7, r3 + + ldrb r6, [r0], #1 + + smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3] + ldrb r7, [r0], #1 + smlad r12, r9, r4, r12 + + pkhbt r10, r10, r6, lsl #16 ; r10 | r9 + pkhbt r6, r6, r7, lsl #16 ; r11 | r10 + smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5] + smlad r12, r6, r5, r12 + + sub r2, r2, #1 + + add r11, r11, #0x40 ; round_shift_and_clamp + tst r2, #0xff ; test loop counter + usat r11, #8, r11, asr #7 + add r12, r12, #0x40 + strh r11, [sp], lr ; result is transposed and stored, which + usat r12, #8, r12, asr #7 + + strh r12, [sp], lr + + movne r11, r6 + movne r12, r7 + + movne r6, r8 + movne r7, r9 + movne r8, r10 + movne r9, r11 + movne r10, r12 + + bne first_pass_wloop_v6 + + ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [src, ppl] + ;;pld [src, r9] + ;;ENDIF + + subs r2, r2, #0x10000 + + mov r6, #158 + sub sp, sp, r6 + + add r0, r0, r1 ; move to next input line + + bne first_pass_hloop_v6 + +;second pass filter +secondpass_filter + mov r1, #18 + sub sp, sp, r1 ; 18+4 + + ldr r3, [sp, #-4] ; load back yoffset + ldr r0, [sp, #216] ; load dst address from stack 180+36 + ldr r1, [sp, #220] ; load dst stride from stack 180+40 + + cmp r3, #0 + beq skip_secondpass_filter + + ldr r12, _filter8_coeff_ + add lr, r12, r3, lsl #4 ;calculate filter location + + mov r2, #0x00080000 + + ldr r3, [lr] ; load up packed filter coefficients + ldr r4, [lr, #4] + ldr r5, [lr, #8] + + pkhbt r12, r4, r3 ; pack the filter differently + pkhbt r11, r5, r4 + +second_pass_hloop_v6 + ldr r6, [sp] ; load the data + ldr r7, [sp, #4] + + orr r2, r2, #2 ; loop counter + +second_pass_wloop_v6 + smuad lr, r3, r6 ; apply filter + smulbt r10, r3, r6 + + ldr r8, [sp, #8] + + smlad 
lr, r4, r7, lr + smladx r10, r12, r7, r10 + + ldrh r9, [sp, #12] + + smlad lr, r5, r8, lr + smladx r10, r11, r8, r10 + + add sp, sp, #4 + smlatb r10, r5, r9, r10 + + sub r2, r2, #1 + + add lr, lr, #0x40 ; round_shift_and_clamp + tst r2, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r0], r1 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + strb r10, [r0],r1 + + movne r6, r7 + movne r7, r8 + + bne second_pass_wloop_v6 + + subs r2, r2, #0x10000 + add sp, sp, #12 ; updata src for next loop (20-8) + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne second_pass_hloop_v6 + + add sp, sp, #20 + ldmia sp!, {r4 - r11, pc} + +;-------------------- +skip_firstpass_filter + sub r0, r0, r1, lsl #1 + sub r1, r1, #8 + mov r2, #9 + mov r3, #20 + +skip_firstpass_hloop + ldrb r4, [r0], #1 ; load data + subs r2, r2, #1 + ldrb r5, [r0], #1 + strh r4, [sp], r3 ; store it to immediate buffer + ldrb r6, [r0], #1 ; load data + strh r5, [sp], r3 + ldrb r7, [r0], #1 + strh r6, [sp], r3 + ldrb r8, [r0], #1 + strh r7, [sp], r3 + ldrb r9, [r0], #1 + strh r8, [sp], r3 + ldrb r10, [r0], #1 + strh r9, [sp], r3 + ldrb r11, [r0], #1 + strh r10, [sp], r3 + add r0, r0, r1 ; move to next input line + strh r11, [sp], r3 + + mov r4, #158 + sub sp, sp, r4 ; move over to next column + bne skip_firstpass_hloop + + b secondpass_filter + +;-------------------- +skip_secondpass_filter + mov r2, #8 + add sp, sp, #4 ;start from src[0] instead of src[-2] + +skip_secondpass_hloop + ldr r6, [sp], #4 + subs r2, r2, #1 + ldr r8, [sp], #4 + + mov r7, r6, lsr #16 ; unpack + strb r6, [r0], r1 + mov r9, r8, lsr #16 + strb r7, [r0], r1 + add sp, sp, #12 ; 20-8 + strb r8, [r0], r1 + strb r9, [r0], r1 + + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne skip_secondpass_hloop + + add sp, sp, #16 ; 180 - (160 +4) + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;----------------- + AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. 
DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter8_coeff_ + DCD filter8_coeff +filter8_coeff + DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 + DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 + DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 + DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 + DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 + DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 + DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 + DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 + + ;DCD 0, 0, 128, 0, 0, 0 + ;DCD 0, -6, 123, 12, -1, 0 + ;DCD 2, -11, 108, 36, -8, 1 + ;DCD 0, -9, 93, 50, -6, 0 + ;DCD 3, -16, 77, 77, -16, 3 + ;DCD 0, -6, 50, 93, -9, 0 + ;DCD 1, -8, 36, 108, -11, 2 + ;DCD 0, -1, 12, 123, -6, 0 + + END diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c new file mode 100644 index 000000000..bf972a3bc --- /dev/null +++ b/vp8/common/arm/bilinearfilter_arm.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <math.h> +#include "subpixel.h" + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +static const short bilinear_filters[8][2] = +{ + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + + +extern void vp8_filter_block2d_bil_first_pass_armv6 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); + +extern void vp8_filter_block2d_bil_second_pass_armv6 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); + +/* +void vp8_filter_block2d_bil_first_pass_6 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + + for ( i=0; i<output_height; i++ ) + { + for ( j=0; j<output_width; j++ ) + { + // Apply bilinear filter + output_ptr[j] = ( ( (int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[1] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void vp8_filter_block2d_bil_second_pass_6 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i,j; + int Temp; + + for ( i=0; i<output_height; i++ ) + { + for ( j=0; j<output_width; j++ ) + { + // Apply filter + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[output_width] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT/2); + output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + // Next row... 
+ //src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} +*/ + +void vp8_filter_block2d_bil_armv6 +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height +) +{ + + unsigned short FData[36*16]; // Temp data bufffer used in filtering + + // First filter 1-D horizontally... + // pixel_step = 1; + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter); + + // then 1-D vertically... + vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); +} + +void vp8_bilinear_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); +} + +void vp8_bilinear_predict8x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); +} + +void vp8_bilinear_predict16x16_armv6 +( + unsigned char 
*src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c new file mode 100644 index 000000000..2a4640cae --- /dev/null +++ b/vp8/common/arm/filter_arm.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include <math.h> +#include "subpixel.h" +#include "vpx_ports/mem.h" + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) = +{ + { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic + { 0, -6, 123, 12, -1, 0 }, + { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter + { 0, -9, 93, 50, -6, 0 }, + { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter + { 0, -6, 50, 93, -9, 0 }, + { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter + { 0, -1, 12, 123, -6, 0 }, +}; + + +extern void vp8_filter_block2d_first_pass_armv6 +( + unsigned char *src_ptr, + short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_width, + unsigned int output_height, + const short *vp8_filter +); + +extern void vp8_filter_block2d_second_pass_armv6 +( + short *src_ptr, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int cnt, + const short *vp8_filter +); + +extern void vp8_filter_block2d_first_pass_only_armv6 
+( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int cnt, + unsigned int output_pitch, + const short *vp8_filter +); + + +extern void vp8_filter_block2d_second_pass_only_armv6 +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int cnt, + unsigned int output_pitch, + const short *vp8_filter +); + +#if HAVE_ARMV6 +void vp8_sixtap_predict_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); // Temp data bufffer used in filtering + + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + // Vfilter is null. First pass only + if (xoffset && !yoffset) + { + //vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter ); + //vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter ); + + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); + } + // Hfilter is null. 
Second pass only + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); + } + else + { + // Vfilter is a 4 tap filter + if (yoffset & 0x1) + vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter); + // Vfilter is 6 tap filter + else + vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter); + + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } +} + +/* +void vp8_sixtap_predict8x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temp data bufffer used in filtering + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + +// if (xoffset && !yoffset) +// { +// vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter ); +// } + // Hfilter is null. 
Second pass only +// else if (!xoffset && yoffset) +// { +// vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter ); +// } +// else +// { +// if (yoffset & 0x1) + // vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter ); + // else + + vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter ); + + vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter ); +// } +} +*/ + +void vp8_sixtap_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temp data bufffer used in filtering + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + if (xoffset && !yoffset) + { + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); + } + // Hfilter is null. 
Second pass only + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); + } + else + { + if (yoffset & 0x1) + vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter); + else + vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); + + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } +} + + +void vp8_sixtap_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); // Temp data bufffer used in filtering + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + if (xoffset && !yoffset) + { + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); + } + // Hfilter is null. Second pass only + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); + } + else + { + if (yoffset & 0x1) + vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); + else + vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); + + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } + +} +#endif diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h new file mode 100644 index 000000000..f9ed21e0d --- /dev/null +++ b/vp8/common/arm/idct_arm.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef IDCT_ARM_H +#define IDCT_ARM_H + +#if HAVE_ARMV6 +extern prototype_idct(vp8_short_idct4x4llm_1_v6); +extern prototype_idct(vp8_short_idct4x4llm_v6_dual); +extern prototype_idct_scalar(vp8_dc_only_idct_armv6); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6); +extern prototype_second_order(vp8_short_inv_walsh4x4_armv6); + +#undef vp8_idct_idct1 +#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6 + +#undef vp8_idct_idct16 +#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual + +#undef vp8_idct_idct1_scalar +#define vp8_idct_idct1_scalar vp8_dc_only_idct_armv6 + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6 + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_idct(vp8_short_idct4x4llm_1_neon); +extern prototype_idct(vp8_short_idct4x4llm_neon); +extern prototype_idct_scalar(vp8_dc_only_idct_neon); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon); +extern prototype_second_order(vp8_short_inv_walsh4x4_neon); + +#undef vp8_idct_idct1 +#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon + +#undef vp8_idct_idct16 +#define vp8_idct_idct16 vp8_short_idct4x4llm_neon + +#undef vp8_idct_idct1_scalar +#define vp8_idct_idct1_scalar vp8_dc_only_idct_neon + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon +#endif + +#endif diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 000000000..fa7c62617 --- /dev/null +++ b/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2010 The VP8 project 
authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include <math.h> +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef void loop_filter_uvfunction +( + unsigned char *u, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + unsigned char *v +); + +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); + +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon); +extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon); +extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon); + +extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon; +extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon; +extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; + + +#if HAVE_ARMV6 +//ARMV6 loopfilter functions +// Horizontal MB filtering +void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int 
simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + +void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Vertical MB Filtering +void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + +void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Horizontal B Filtering +void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + 
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +// Vertical B Filtering +void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + 
(void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} +#endif + +#if HAVE_ARMV7 +// NEON loopfilter functions +// Horizontal MB filtering +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr); +} + +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Vertical MB Filtering +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr); +} + +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, 
lfi->mbthr, 2); +} + +// Horizontal B Filtering +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride); +} + +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +// Vertical B Filtering +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4); +} + +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, 
unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} +#endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 000000000..4bb49456d --- /dev/null +++ b/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef LOOPFILTER_ARM_H +#define LOOPFILTER_ARM_H + +#if HAVE_ARMV6 +extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); + +#undef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6 + +#undef vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_armv6 + +#undef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_armv6 + +#undef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 + +#undef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 + +#undef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6 + +#undef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 + +#undef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bv_neon); +extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bh_neon); +extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon); +extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); + +#undef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon + +#undef vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_neon + +#undef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h 
vp8_loop_filter_mbh_neon + +#undef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_neon + +#undef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_neon + +#undef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_neon + +#undef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_neon + +#undef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon +#endif + +#endif diff --git a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm new file mode 100644 index 000000000..a2fea2bd6 --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm @@ -0,0 +1,361 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_bilinear_predict16x16_neon| PROC + push {r4-r5, lr} + + ldr r12, _bifilter16_coeff_ + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 
d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! + vld1.u8 {d11, d12, d13}, [r0], r1 + + bne filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! 
+ +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + vst1.u8 {d4, d5}, [r4], r5 + vst1.u8 {d6, d7}, [r4], r5 + vmov q11, q15 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_sp16x16_loop_neon + + add sp, sp, #272 + + pop {r4-r5,pc} + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + 
vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r4], r5 ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r4], r5 + vst1.u8 {d18, d19}, [r4], r5 + vst1.u8 {d20, d21}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + pop {r4-r5,pc} + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, 
q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r4], r5 + vmov q11, q15 + vst1.u8 {d6, d7}, [r4], r5 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_spo16x16_loop_neon + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA bifilters16_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter16_coeff_ + DCD bifilter16_coeff +bifilter16_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm new file mode 100644 index 000000000..74d2db5dc --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm @@ -0,0 +1,134 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict4x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict4x4_neon| PROC + push {r4, lr} + + ldr r12, _bifilter4_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x4) + vld1.u8 {d2}, [r0], r1 ;load src data + add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) + + vld1.u8 {d3}, [r0], r1 + vld1.u32 {d31}, [r2] ;first_pass filter + + vld1.u8 {d4}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0-d1) + vld1.u8 {d5}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {d6}, [r0], r1 + + vshr.u64 q4, q1, #8 ;construct src_ptr[1] + vshr.u64 q5, q2, #8 + vshr.u64 d12, d6, #8 + + vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d4, d5 + vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + + vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q8, d10, d1 + vmlal.u8 q9, d12, d1 + + vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d29, q8, #7 + vqrshrn.u16 d30, q9, #7 + +;Second pass: 4x4 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 ;calculate Vfilter location + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d31[4] + + vmull.u8 q1, d28, d0 + vmull.u8 q2, d29, d0 + + vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] + vext.8 d27, d29, d30, #4 + + vmlal.u8 q1, d26, d1 + vmlal.u8 q2, d27, d1 + + add r0, r4, lr + 
add r1, r0, lr + add r2, r1, lr + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + + vst1.32 {d2[0]}, [r4] ;store result + vst1.32 {d2[1]}, [r0] + vst1.32 {d3[0]}, [r1] + vst1.32 {d3[1]}, [r2] + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + + vld1.32 {d28[0]}, [r0], r1 ;load src data + vld1.32 {d28[1]}, [r0], r1 + vld1.32 {d29[0]}, [r0], r1 + vld1.32 {d29[1]}, [r0], r1 + vld1.32 {d30[0]}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.32 {d28[0]}, [r4], lr ;store result + vst1.32 {d28[1]}, [r4], lr + vst1.32 {d29[0]}, [r4], lr + vst1.32 {d29[1]}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + AREA bilinearfilters4_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter4_coeff_ + DCD bifilter4_coeff +bifilter4_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm new file mode 100644 index 000000000..46ebb0e0b --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm @@ -0,0 +1,139 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict8x4_neon| PROC + push {r4, lr} + + ldr r12, _bifilter8x4_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {q5}, [r0], r1 + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d23, q7, #7 + vqrshrn.u16 d24, q8, #7 + vqrshrn.u16 d25, q9, #7 + vqrshrn.u16 d26, q10, #7 + +;Second pass: 4x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 + add r0, r4, lr + + vld1.u32 {d31}, [r3] ;load second_pass filter + add r1, r0, lr + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * 
vp8_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + + add r2, r1, lr + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1] + vst1.u8 {d5}, [r2] + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, [r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + AREA bifilters8x4_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter8x4_coeff_ + DCD bifilter8x4_coeff +bifilter8x4_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm new file mode 100644 index 000000000..80728d4f8 --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm @@ -0,0 +1,187 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict8x8_neon| PROC + push {r4, lr} + + ldr r12, _bifilter8_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (9x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + + vld1.u8 {q1}, [r0], r1 ;load src data + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vld1.u8 {q2}, [r0], r1 + vqrshrn.u16 d23, q7, #7 + vld1.u8 {q3}, [r0], r1 + vqrshrn.u16 d24, q8, #7 + vld1.u8 {q4}, [r0], r1 + vqrshrn.u16 d25, q9, #7 + + ;first_pass filtering on the rest 5-line data + vld1.u8 {q5}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, 
d11, d1 + + vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d27, q7, #7 + vqrshrn.u16 d28, q8, #7 + vqrshrn.u16 d29, q9, #7 + vqrshrn.u16 d30, q10, #7 + +;Second pass: 8x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 + add r0, r4, lr + + vld1.u32 {d31}, [r3] ;load second_pass filter + add r1, r0, lr + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1], lr + vst1.u8 {d5}, [r1], lr + vst1.u8 {d6}, [r1], lr + vst1.u8 {d7}, [r1], lr + vst1.u8 {d8}, [r1], lr + vst1.u8 {d9}, [r1], lr + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, [r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + vst1.u8 {d26}, [r4], lr + vst1.u8 {d27}, [r4], lr + vst1.u8 {d28}, [r4], lr + vst1.u8 {d29}, [r4], lr + + pop {r4, pc} + + ENDP + 
+;----------------- + AREA bifilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter8_coeff_ + DCD bifilter8_coeff +bifilter8_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm new file mode 100644 index 000000000..f42ac63c9 --- /dev/null +++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm @@ -0,0 +1,583 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_build_intra_predictors_mby_neon_func| + EXPORT |vp8_build_intra_predictors_mby_s_neon_func| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_neon_func| PROC + push {r4-r8, lr} + + cmp r3, #0 + beq case_dc_pred + cmp r3, #1 + beq case_v_pred + cmp r3, #2 + beq case_h_pred + cmp r3, #3 + beq case_tm_pred + +case_dc_pred + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up + + cmp r5, #0 + beq skip_dc_pred_left + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left + vst1.u8 {q0}, [r1]! 
+ vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + + pop {r4-r8,pc} +case_v_pred + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + pop {r4-r8,pc} + +case_h_pred + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! 
+ + pop {r4-r8,pc} + +case_tm_pred + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! 
+ + subs r12, r12, #1 + bne case_tm_pred_loop + + pop {r4-r8,pc} + + ENDP + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_s_neon_func| PROC + push {r4-r8, lr} + + mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; + + cmp r3, #0 + beq case_dc_pred_s + cmp r3, #1 + beq case_v_pred_s + cmp r3, #2 + beq case_h_pred_s + cmp r3, #3 + beq case_tm_pred_s + +case_dc_pred_s + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left_s + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up_s + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up_s + + cmp r5, #0 + beq skip_dc_pred_left_s + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left_s + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left_s + vst1.u8 {q0}, 
[r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + + pop {r4-r8,pc} +case_v_pred_s + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + pop {r4-r8,pc} + +case_h_pred_s + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + pop {r4-r8,pc} + +case_tm_pred_s + ; 
Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop_s + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + subs r12, r12, #1 + bne case_tm_pred_loop_s + + pop {r4-r8,pc} + + ENDP + + + END diff --git a/vp8/common/arm/neon/copymem16x16_neon.asm b/vp8/common/arm/neon/copymem16x16_neon.asm new file mode 100644 index 000000000..89d5e1018 --- /dev/null +++ b/vp8/common/arm/neon/copymem16x16_neon.asm @@ -0,0 +1,58 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem16x16_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_neon| PROC + + vld1.u8 {q0}, [r0], r1 + vld1.u8 {q1}, [r0], r1 + vld1.u8 {q2}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + vld1.u8 {q3}, [r0], r1 + vst1.u8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vst1.u8 {q2}, [r2], r3 + vld1.u8 {q5}, [r0], r1 + vst1.u8 {q3}, [r2], r3 + vld1.u8 {q6}, [r0], r1 + vst1.u8 {q4}, [r2], r3 + vld1.u8 {q7}, [r0], r1 + vst1.u8 {q5}, [r2], r3 + vld1.u8 {q8}, [r0], r1 + vst1.u8 {q6}, [r2], r3 + vld1.u8 {q9}, [r0], r1 + vst1.u8 {q7}, [r2], r3 + vld1.u8 {q10}, [r0], r1 + vst1.u8 {q8}, [r2], r3 + vld1.u8 {q11}, [r0], r1 + vst1.u8 {q9}, [r2], r3 + vld1.u8 {q12}, [r0], r1 + vst1.u8 {q10}, [r2], r3 + vld1.u8 {q13}, [r0], r1 + vst1.u8 {q11}, [r2], r3 + vld1.u8 {q14}, [r0], r1 + vst1.u8 {q12}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + vst1.u8 {q13}, [r2], r3 + vst1.u8 {q14}, [r2], r3 + vst1.u8 {q15}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_neon| + + END diff --git a/vp8/common/arm/neon/copymem8x4_neon.asm b/vp8/common/arm/neon/copymem8x4_neon.asm new file mode 100644 index 000000000..302f734ff --- /dev/null +++ b/vp8/common/arm/neon/copymem8x4_neon.asm @@ -0,0 +1,33 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x4_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_neon| PROC + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vst1.u8 {d3}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_neon| + + END diff --git a/vp8/common/arm/neon/copymem8x8_neon.asm b/vp8/common/arm/neon/copymem8x8_neon.asm new file mode 100644 index 000000000..50d39ef66 --- /dev/null +++ b/vp8/common/arm/neon/copymem8x8_neon.asm @@ -0,0 +1,42 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x8_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_neon| PROC + + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vld1.u8 {d4}, [r0], r1 + vst1.u8 {d3}, [r2], r3 + vld1.u8 {d5}, [r0], r1 + vst1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r0], r1 + vst1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r0], r1 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_neon| + + END diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm new file mode 100644 index 000000000..4fc744c96 --- /dev/null +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -0,0 +1,95 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + EXPORT |vp8_short_inv_walsh4x4_neon| + EXPORT |vp8_short_inv_walsh4x4_1_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_neon| PROC + + ; read in all four lines of values: d0->d3 + vldm.64 r0, {q0, q1} + + ; first for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [12] + vadd.s16 d5, d1, d2 ;b = [4] + [8] + vsub.s16 d6, d1, d2 ;c = [4] - [8] + vsub.s16 d7, d0, d3 ;d = [0] - [12] + + vadd.s16 d0, d4, d5 ;a + b + vadd.s16 d1, d6, d7 ;c + d + vsub.s16 d2, d4, d5 ;a - b + vsub.s16 d3, d7, d6 ;d - c + + vtrn.32 d0, d2 ;d0: 0 1 8 9 + ;d2: 2 3 10 11 + vtrn.32 d1, d3 ;d1: 4 5 12 13 + ;d3: 6 7 14 15 + + vtrn.16 d0, d1 ;d0: 0 4 8 12 + ;d1: 1 5 9 13 + vtrn.16 d2, d3 ;d2: 2 6 10 14 + ;d3: 3 7 11 15 + + ; second for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [3] + vadd.s16 d5, d1, d2 ;b = [1] + [2] + vsub.s16 d6, d1, d2 ;c = [1] - [2] + vsub.s16 d7, d0, d3 ;d = [0] - [3] + + vadd.s16 d0, d4, d5 ;e = a + b + vadd.s16 d1, d6, d7 ;f = c + d + vsub.s16 d2, d4, d5 ;g = a - b + vsub.s16 d3, d7, d6 ;h = d - c + + vmov.i16 q2, #3 + vadd.i16 q0, q0, q2 ;e/f += 3 + vadd.i16 q1, q1, q2 ;g/h += 3 + + vshr.s16 q0, q0, #3 ;e/f >> 3 + vshr.s16 q1, q1, #3 ;g/h >> 3 + + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vstmia.16 r1!, {q0} + vstmia.16 r1!, {q1} + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_neon| + + +;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_1_neon| PROC + ; load a full line into a neon register + vld1.16 {q0}, [r0] + ; extract first element and replicate + vdup.16 q1, d0[0] + ; add 3 to all values + vmov.i16 q2, #3 + vadd.i16 q3, q1, q2 + ; right shift + vshr.s16 q3, q3, #3 + ; write it back + vstmia.16 r1!, {q3} + vstmia.16 r1!, {q3} + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_neon| + + END diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm 
b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm new file mode 100644 index 000000000..e3e8e8a72 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm @@ -0,0 +1,205 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v + +|vp8_loop_filter_horizontal_edge_uv_neon| PROC + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vld1.s8 {d0[], d1[]}, [r2] ; flimit + + ldr r2, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + + sub r2, r2, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0], r1 ; p3 + vld1.u8 {d7}, [r2], r1 ; p3 + vld1.u8 {d8}, [r0], r1 ; p2 + vld1.u8 {d9}, [r2], r1 ; p2 + vld1.u8 {d10}, [r0], r1 ; p1 + vld1.u8 {d11}, [r2], r1 ; p1 + vld1.u8 {d12}, [r0], r1 ; p0 + vld1.u8 {d13}, [r2], r1 ; p0 + vld1.u8 {d14}, [r0], r1 ; q0 + vld1.u8 {d15}, [r2], r1 ; q0 + vld1.u8 {d16}, [r0], r1 ; q1 + vld1.u8 {d17}, [r2], r1 ; q1 + vld1.u8 {d18}, [r0], r1 ; q2 + vld1.u8 {d19}, [r2], r1 ; q2 + vld1.u8 {d20}, [r0], r1 ; q3 + vld1.u8 {d21}, [r2], r1 ; q3 + + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.s8 {d4[], d5[]}, [r12] ; thresh + + ldr r12, _lfhuv_coeff_ + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; 
abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1 + vadd.u8 q0, q0, q0 ; flimit * 2 + vadd.u8 q0, q0, q1 ; flimit * 2 + limit + + vand q15, q15, q12 + vand q10, q10, q11 + vand q3, q3, q4 + + vabd.u8 q2, q5, q8 ; abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2 + vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vld1.u8 {q0}, [r12]! + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value +;;;;;;;;;;;;;; + vld1.u8 {q10}, [r12]! + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! 
;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! + ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + ; + + vbic q1, q1, q14 ; vp8_filter &= ~hev + + sub r2, r2, r1, lsl #2 + sub r2, r2, r1, lsl #1 + ;; + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + ; + + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + ; + + vst1.u8 {d10}, [r0], r1 ; store u op1 + vst1.u8 {d11}, [r2], r1 ; store v op1 + vst1.u8 {d12}, [r0], r1 ; store u op0 + vst1.u8 {d13}, [r2], r1 ; store v op0 + vst1.u8 {d14}, [r0], r1 ; store u oq0 + vst1.u8 {d15}, [r2], r1 ; store v oq0 + vst1.u8 
{d16}, [r0], r1 ; store u oq1 + vst1.u8 {d17}, [r2], r1 ; store v oq1 + + bx lr + ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| + +;----------------- + AREA hloopfilteruv_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_lfhuv_coeff_ + DCD lfhuv_coeff +lfhuv_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm new file mode 100644 index 000000000..f11055d42 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm @@ -0,0 +1,188 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_y_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. 
+; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_horizontal_edge_y_neon| PROC + sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r12, [sp, #0] ; load thresh pointer + + vld1.u8 {q3}, [r0], r1 ; p3 + vld1.s8 {d0[], d1[]}, [r2] ; flimit + vld1.u8 {q4}, [r0], r1 ; p2 + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.u8 {q5}, [r0], r1 ; p1 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {q6}, [r0], r1 ; p0 + ldr r12, _lfhy_coeff_ + vld1.u8 {q7}, [r0], r1 ; q0 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vld1.u8 {q8}, [r0], r1 ; q1 + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vld1.u8 {q9}, [r0], r1 ; q2 + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vld1.u8 {q10}, [r0], r1 ; q3 + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1 + vadd.u8 q0, q0, q0 ; flimit * 2 + vadd.u8 q0, q0, q1 ; flimit * 2 + limit + + vand q15, q15, q12 + vand q10, q10, q11 + vand q3, q3, q4 + + vabd.u8 q2, q5, q8 ; abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2 + vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vld1.u8 {q0}, [r12]! 
+ + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value +;;;;;;;;;;;;;; + vld1.u8 {q10}, [r12]! + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! ;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! + ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + 
vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + ; + + vbic q1, q1, q14 ; vp8_filter &= ~hev + ; + add r2, r1, r0 + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + add r3, r2, r1 + + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + + add r12, r3, r1 + + vst1.u8 {q5}, [r0] ; store op1 + vst1.u8 {q6}, [r2] ; store op0 + vst1.u8 {q7}, [r3] ; store oq0 + vst1.u8 {q8}, [r12] ; store oq1 + + bx lr + ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| + +;----------------- + AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_lfhy_coeff_ + DCD lfhy_coeff +lfhy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm new file mode 100644 index 000000000..6d74fab52 --- /dev/null +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -0,0 +1,117 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_simple_horizontal_edge_neon| PROC + sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines + + ldr r12, _lfhy_coeff_ + vld1.u8 {q5}, [r0], r1 ; p1 + vld1.s8 {d2[], d3[]}, [r2] ; flimit + vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 + vld1.u8 {q6}, [r0], r1 ; p0 + vld1.u8 {q0}, [r12]! ; 0x80 + vld1.u8 {q7}, [r0], r1 ; q0 + vld1.u8 {q10}, [r12]! ; 0x03 + vld1.u8 {q8}, [r0] ; q1 + + ;vp8_filter_mask() function + vabd.u8 q15, q6, q7 ; abs(p0 - q0) + vabd.u8 q14, q5, q8 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + + vadd.u8 q1, q1, q1 ; flimit * 2 + vadd.u8 q1, q1, q13 ; flimit * 2 + limit + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + +;;;;;;;;;; + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q3, d15, d13 + + vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q12, q3, q3 + + vld1.u8 {q9}, [r12]! 
; 0x04 + + vadd.s16 q2, q2, q11 + vadd.s16 q3, q3, q12 + + vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q3, q3, d9 + + ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d9, q3 +;;;;;;;;;;;;; + + vand q4, q4, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q4, q4, #3 ; Filter1 >>= 3 + + sub r0, r0, r1, lsl #1 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) + + add r3, r0, r1 + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + + vst1.u8 {q6}, [r0] ; store op0 + vst1.u8 {q7}, [r3] ; store oq0 + + bx lr + ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| + +;----------------- + AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_lfhy_coeff_ + DCD lfhy_coeff +lfhy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + + END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm new file mode 100644 index 000000000..2bb6222b9 --- /dev/null +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -0,0 +1,158 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_simple_vertical_edge_neon| PROC + sub r0, r0, #2 ; move src pointer down by 2 columns + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1 + vld1.s8 {d2[], d3[]}, [r2] ; flimit + vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 + ldr r12, _vlfy_coeff_ + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vld1.u8 {q0}, [r12]! ; 0x80 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vld1.u8 {q11}, [r12]! ; 0x03 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vld1.u8 {q12}, [r12]! 
; 0x04 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + + vswp d7, d10 + vswp d12, d9 + ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + sub r0, r0, r1, lsl #4 + vabd.u8 q15, q5, q4 ; abs(p0 - q0) + vabd.u8 q14, q3, q6 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value + veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value + veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value + veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value + + vadd.u8 q1, q1, q1 ; flimit * 2 + vadd.u8 q1, q1, q13 ; flimit * 2 + limit + vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + ;vp8_filter() function +;;;;;;;;;; + ;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0) + vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) + vsubl.s8 q13, d9, d11 + + vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q14, q13, q13 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q14 + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + + add r0, r0, #1 + add r2, r0, r1 +;;;;;;;;;;; + + vand q1, q1, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 
>>= 3 + + ;calculate output + vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) + vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + + add r3, r2, r1 + vswp d13, d14 + add r12, r3, r1 + + ;store op1, op0, oq0, oq1 + vst2.8 {d12[0], d13[0]}, [r0] + vst2.8 {d12[1], d13[1]}, [r2] + vst2.8 {d12[2], d13[2]}, [r3] + vst2.8 {d12[3], d13[3]}, [r12], r1 + add r0, r12, r1 + vst2.8 {d12[4], d13[4]}, [r12] + vst2.8 {d12[5], d13[5]}, [r0], r1 + add r2, r0, r1 + vst2.8 {d12[6], d13[6]}, [r0] + vst2.8 {d12[7], d13[7]}, [r2], r1 + add r3, r2, r1 + vst2.8 {d14[0], d15[0]}, [r2] + vst2.8 {d14[1], d15[1]}, [r3], r1 + add r12, r3, r1 + vst2.8 {d14[2], d15[2]}, [r3] + vst2.8 {d14[3], d15[3]}, [r12], r1 + add r0, r12, r1 + vst2.8 {d14[4], d15[4]}, [r12] + vst2.8 {d14[5], d15[5]}, [r0], r1 + add r2, r0, r1 + vst2.8 {d14[6], d15[6]}, [r0] + vst2.8 {d14[7], d15[7]}, [r2] + + bx lr + ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| + +;----------------- + AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_vlfy_coeff_ + DCD vlfy_coeff +vlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + + END diff --git a/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm new file mode 100644 index 000000000..d79cc68a3 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm @@ -0,0 +1,231 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_vertical_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v + +|vp8_loop_filter_vertical_edge_uv_neon| PROC + sub r0, r0, #4 ; move u pointer down by 4 columns + vld1.s8 {d0[], d1[]}, [r2] ; flimit + + ldr r2, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + + sub r2, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r2], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r2], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r2], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r2], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r2], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r2], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r2], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.s8 {d4[], d5[]}, [r12] ; thresh + + ldr r12, _vlfuv_coeff_ + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, 
q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1 + vadd.u8 q0, q0, q0 ; flimit * 2 + vadd.u8 q0, q0, q1 ; flimit * 2 + limit + + vand q15, q15, q12 + vand q10, q10, q11 + vand q3, q3, q4 + + vabd.u8 q2, q5, q8 ; abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2 + vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vld1.u8 {q0}, [r12]! + + vand q15, q15, q10 + + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value +;;;;;;;;;;;;;; + vld1.u8 {q10}, [r12]! + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! ;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! 
+ ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #3 + add r0, r0, #2 + + vbic q1, q1, q14 ; vp8_filter &= ~hev + + sub r2, r2, r1, lsl #3 + add r2, r2, #2 + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + + vswp d12, d11 + vswp d16, d13 + vswp d14, d12 + vswp d16, d15 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], 
r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2], r1 + + bx lr + ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| + +;----------------- + AREA vloopfilteruv_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_vlfuv_coeff_ + DCD vlfuv_coeff +vlfuv_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm new file mode 100644 index 000000000..3a230a953 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm @@ -0,0 +1,235 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_vertical_edge_y_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. 
So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_vertical_edge_y_neon| PROC + sub r0, r0, #4 ; move src pointer down by 4 columns + ldr r12, [sp, #0] ; load thresh pointer + + vld1.u8 {d6}, [r0], r1 ; load first 8-line src data + vld1.s8 {d0[], d1[]}, [r2] ; flimit + vld1.u8 {d8}, [r0], r1 + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.u8 {d10}, [r0], r1 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d12}, [r0], r1 + ldr r12, _vlfy_coeff_ + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + + vld1.u8 {d7}, [r0], r1 ; load second 8-line src data + vld1.u8 {d9}, [r0], r1 + vld1.u8 {d11}, [r0], r1 + vld1.u8 {d13}, [r0], r1 + vld1.u8 {d15}, [r0], r1 + vld1.u8 {d17}, [r0], r1 + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1 + 
vadd.u8 q0, q0, q0 ; flimit * 2 + vadd.u8 q0, q0, q1 ; flimit * 2 + limit + + vand q15, q15, q12 + vand q10, q10, q11 + vand q3, q3, q4 + + vabd.u8 q2, q5, q8 ; abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2 + vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vld1.u8 {q0}, [r12]! + + vand q15, q15, q10 + + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value +;;;;;;;;;;;;;; + vld1.u8 {q10}, [r12]! + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! ;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! 
+ ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #4 + add r0, r0, #2 + ; + + vbic q1, q1, q14 ; vp8_filter &= ~hev + add r2, r0, r1 + ; + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + add r3, r2, r1 + ; + vswp d12, d11 + vswp d16, d13 + add r12, r3, r1 + vswp d14, d12 + vswp d16, d15 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0] + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r2] + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r3] + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 + add r0, r12, r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r12] + vst4.8 {d10[5], d11[5], 
d12[5], d13[5]}, [r0], r1 + add r2, r0, r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0] + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r2], r1 + add r3, r2, r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2] + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r3], r1 + add r12, r3, r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r3] + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 + add r0, r12, r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r12] + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1 + add r2, r0, r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] + + bx lr + ENDP ; |vp8_loop_filter_vertical_edge_y_neon| + +;----------------- + AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_vlfy_coeff_ + DCD vlfy_coeff +vlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm new file mode 100644 index 000000000..86eddaa2e --- /dev/null +++ b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm @@ -0,0 +1,257 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v +|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vld1.s8 {d2[], d3[]}, [r3] ; limit + ldr r3, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0], r1 ; p3 + vld1.u8 {d7}, [r3], r1 ; p3 + vld1.u8 {d8}, [r0], r1 ; p2 + vld1.u8 {d9}, [r3], r1 ; p2 + vld1.u8 {d10}, [r0], r1 ; p1 + vld1.u8 {d11}, [r3], r1 ; p1 + vld1.u8 {d12}, [r0], r1 ; p0 + vld1.u8 {d13}, [r3], r1 ; p0 + vld1.u8 {d14}, [r0], r1 ; q0 + vld1.u8 {d15}, [r3], r1 ; q0 + vld1.u8 {d16}, [r0], r1 ; q1 + vld1.u8 {d17}, [r3], r1 ; q1 + vld1.u8 {d18}, [r0], r1 ; q2 + vld1.u8 {d19}, [r3], r1 ; q2 + vld1.u8 {d20}, [r0], r1 ; q3 + vld1.u8 {d21}, [r3], r1 ; q3 + + vld1.s8 {d4[], d5[]}, [r12] ; thresh + + ldr r12, _mbhlfuv_coeff_ + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + 
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! + + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + ;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! 
;#9 + ; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! ;#18 + +; sub r0, r0, r1, lsl #3 +; sub r3, r3, r1, lsl #3 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; add r0, r0, r1 +; add r3, r3, r1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + + sub r0, r0, r1, lsl #3 + sub r3, r3, r1, lsl #3 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! 
;#27 + + add r0, r0, r1 + add r3, r3, r1 + + vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter + + ; roughly 1/7th difference across boundary + ; roughly 2/7th difference across boundary + ; roughly 3/7th difference across boundary + vmov q11, q15 + vmov q13, q15 + vmov q14, q15 + + vmlal.s8 q10, d2, d7 ; Filter2 * 9 + vmlal.s8 q11, d3, d7 + vmlal.s8 q12, d2, d6 ; Filter2 * 18 + vmlal.s8 q13, d3, d6 + vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q15, d3, d5 + vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d21, q11, #7 + vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + vqshrn.s16 d25, q13, #7 + vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + vqshrn.s16 d29, q15, #7 + + vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u) + vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u) + vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u) + vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u) + vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u) + vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u) + veor q9, q11, q0 ; *oq2 = s^0x80 + veor q4, q10, q0 ; *op2 = s^0x80 + veor q8, q13, q0 ; *oq1 = s^0x80 + veor q5, q12, q0 ; *op2 = s^0x80 + veor q7, q15, q0 ; *oq0 = s^0x80 + veor q6, q14, q0 ; *op0 = s^0x80 + + vst1.u8 {d8}, [r0], r1 ; store u op2 + vst1.u8 {d9}, [r3], r1 ; store v op2 + vst1.u8 {d10}, [r0], r1 ; store u op1 + vst1.u8 {d11}, [r3], r1 ; store v op1 + vst1.u8 {d12}, [r0], r1 ; store u op0 + vst1.u8 {d13}, [r3], r1 ; store v op0 + vst1.u8 {d14}, [r0], r1 ; store u oq0 + vst1.u8 {d15}, [r3], r1 ; store v oq0 + vst1.u8 {d16}, [r0], r1 ; store u oq1 + vst1.u8 {d17}, [r3], r1 ; store v oq1 + vst1.u8 {d18}, [r0], r1 ; store u oq2 + vst1.u8 {d19}, [r3], r1 ; store v oq2 + + bx lr + ENDP ; 
|vp8_mbloop_filter_horizontal_edge_uv_neon| + +;----------------- + AREA mbhloopfilteruv_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_mbhlfuv_coeff_ + DCD mbhlfuv_coeff +mbhlfuv_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f + DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 + DCD 0x1b1b1b1b, 0x1b1b1b1b + + END diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm new file mode 100644 index 000000000..2ab0fc240 --- /dev/null +++ b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm @@ -0,0 +1,236 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. 
+; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused +|vp8_mbloop_filter_horizontal_edge_y_neon| PROC + sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r12, [sp, #0] ; load thresh pointer + + vld1.u8 {q3}, [r0], r1 ; p3 + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.u8 {q4}, [r0], r1 ; p2 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {q5}, [r0], r1 ; p1 + ldr r12, _mbhlfy_coeff_ + vld1.u8 {q6}, [r0], r1 ; p0 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vld1.u8 {q7}, [r0], r1 ; q0 + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vld1.u8 {q8}, [r0], r1 ; q1 + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vld1.u8 {q9}, [r0], r1 ; q2 + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vld1.u8 {q10}, [r0], r1 ; q3 + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! 
+ + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + ;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! ;#9 + sub r0, r0, r1, lsl #3 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! 
;#18 + +; add r0, r0, r1 +; add r2, r0, r1 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; add r3, r2, r1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + add r0, r0, r1 + add r2, r0, r1 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! ;#27 + add r3, r2, r1 + + vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter + + ; roughly 1/7th difference across boundary + ; roughly 2/7th difference across boundary + ; roughly 3/7th difference across boundary + vmov q11, q15 + vmov q13, q15 + vmov q14, q15 + + vmlal.s8 q10, d2, d7 ; Filter2 * 9 + vmlal.s8 q11, d3, d7 + vmlal.s8 q12, d2, d6 ; Filter2 * 18 + vmlal.s8 q13, d3, d6 + vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q15, d3, d5 + vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d21, q11, #7 + vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + vqshrn.s16 d25, q13, #7 + vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + vqshrn.s16 d29, q15, #7 + + vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u) + vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u) + vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u) + vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u) + vqsub.s8 
q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u) + vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u) + veor q9, q11, q0 ; *oq2 = s^0x80 + veor q4, q10, q0 ; *op2 = s^0x80 + veor q5, q12, q0 ; *op2 = s^0x80 + veor q6, q14, q0 ; *op0 = s^0x80 + veor q8, q13, q0 ; *oq1 = s^0x80 + veor q7, q15, q0 ; *oq0 = s^0x80 + + vst1.u8 {q4}, [r0] ; store op2 + vst1.u8 {q5}, [r2] ; store op1 + vst1.u8 {q6}, [r3], r1 ; store op0 + add r12, r3, r1 + vst1.u8 {q7}, [r3] ; store oq0 + vst1.u8 {q8}, [r12], r1 ; store oq1 + vst1.u8 {q9}, [r12] ; store oq2 + + bx lr + ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| + +;----------------- + AREA mbhloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_mbhlfy_coeff_ + DCD mbhlfy_coeff +mbhlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f + DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 + DCD 0x1b1b1b1b, 0x1b1b1b1b + + END diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm new file mode 100644 index 000000000..ad5afba34 --- /dev/null +++ b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm @@ -0,0 +1,296 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v +|vp8_mbloop_filter_vertical_edge_uv_neon| PROC + sub r0, r0, #4 ; move src pointer down by 4 columns + vld1.s8 {d2[], d3[]}, [r3] ; limit + ldr r3, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + + sub r3, r3, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r3], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub sp, sp, #32 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vst1.u8 {q3}, [sp]! + ldr r12, _mbvlfuv_coeff_ + vst1.u8 {q10}, [sp]! 
+ + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! + + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + 
;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! ;#9 + ; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! ;#18 + +; sub r0, r0, r1, lsl #3 +; sub r3, r3, r1, lsl #3 +; sub sp, sp, #32 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + + sub r0, r0, r1, lsl #3 + sub r3, r3, r1, lsl #3 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! 
;#27 + + sub sp, sp, #32 + + vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter + + ; roughly 1/7th difference across boundary + ; roughly 2/7th difference across boundary + ; roughly 3/7th difference across boundary + vmov q11, q15 + vmov q13, q15 + vmov q14, q15 + + vmlal.s8 q10, d2, d7 ; Filter2 * 9 + vmlal.s8 q11, d3, d7 + vmlal.s8 q12, d2, d6 ; Filter2 * 18 + vmlal.s8 q13, d3, d6 + vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q15, d3, d5 + vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d21, q11, #7 + vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + vqshrn.s16 d25, q13, #7 + vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + vqshrn.s16 d29, q15, #7 + + vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u) + vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u) + vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u) + vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u) + vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u) + vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u) + veor q9, q11, q0 ; *oq2 = s^0x80 + veor q4, q10, q0 ; *op2 = s^0x80 + veor q8, q13, q0 ; *oq1 = s^0x80 + veor q5, q12, q0 ; *op2 = s^0x80 + veor q7, q15, q0 ; *oq0 = s^0x80 + vld1.u8 {q3}, [sp]! + veor q6, q14, q0 ; *op0 = s^0x80 + vld1.u8 {q10}, [sp]! 
+ + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r3], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r3], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r3], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d13}, [r3], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r3], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r3], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r3], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d21}, [r3], r1 + + bx lr + ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| + +;----------------- + AREA mbvloopfilteruv_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_mbvlfuv_coeff_ + DCD mbvlfuv_coeff +mbvlfuv_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f + DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 + DCD 0x1b1b1b1b, 0x1b1b1b1b + + END diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm new file mode 100644 index 000000000..60e517519 --- /dev/null +++ b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm @@ -0,0 +1,303 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused +|vp8_mbloop_filter_vertical_edge_y_neon| PROC + sub r0, r0, #4 ; move src pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ; load first 8-line src data + ldr r12, [sp, #0] ; load thresh pointer + vld1.u8 {d8}, [r0], r1 + sub sp, sp, #32 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + + vld1.u8 {d7}, [r0], r1 ; load second 8-line src data + vld1.u8 {d9}, [r0], r1 + vld1.u8 {d11}, [r0], r1 + vld1.u8 {d13}, [r0], r1 + vld1.u8 {d15}, [r0], r1 + vld1.u8 {d17}, [r0], r1 + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + vld1.s8 {d2[], d3[]}, [r3] ; limit + vst1.u8 {q3}, [sp]! + vld1.s8 {d4[], d5[]}, [r12] ; thresh + ldr r12, _mbvlfy_coeff_ + vst1.u8 {q10}, [sp]! 
+ + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! + + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + 
;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! ;#9 + ; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! ;#18 + +; sub r0, r0, r1, lsl #4 +; sub sp, sp, #32 +; add r2, r0, r1 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; add r3, r2, r1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + sub r0, r0, r1, lsl #4 + sub sp, sp, #32 + + add r2, r0, r1 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! 
;#27 + add r3, r2, r1 + + vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter + + ; roughly 1/7th difference across boundary + ; roughly 2/7th difference across boundary + ; roughly 3/7th difference across boundary + vmov q11, q15 + vmov q13, q15 + vmov q14, q15 + + vmlal.s8 q10, d2, d7 ; Filter2 * 9 + vmlal.s8 q11, d3, d7 + vmlal.s8 q12, d2, d6 ; Filter2 * 18 + vmlal.s8 q13, d3, d6 + vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q15, d3, d5 + vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d21, q11, #7 + vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + vqshrn.s16 d25, q13, #7 + vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + vqshrn.s16 d29, q15, #7 + + vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u) + vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u) + vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u) + vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u) + vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u) + vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u) + veor q9, q11, q0 ; *oq2 = s^0x80 + veor q4, q10, q0 ; *op2 = s^0x80 + veor q8, q13, q0 ; *oq1 = s^0x80 + veor q5, q12, q0 ; *op2 = s^0x80 + veor q7, q15, q0 ; *oq0 = s^0x80 + vld1.u8 {q3}, [sp]! + veor q6, q14, q0 ; *op0 = s^0x80 + vld1.u8 {q10}, [sp]! 
+ + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + add r12, r3, r1 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0] + vst1.8 {d8}, [r2] + vst1.8 {d10}, [r3] + vst1.8 {d12}, [r12], r1 + add r0, r12, r1 + vst1.8 {d14}, [r12] + vst1.8 {d16}, [r0], r1 + add r2, r0, r1 + vst1.8 {d18}, [r0] + vst1.8 {d20}, [r2], r1 + add r3, r2, r1 + vst1.8 {d7}, [r2] + vst1.8 {d9}, [r3], r1 + add r12, r3, r1 + vst1.8 {d11}, [r3] + vst1.8 {d13}, [r12], r1 + add r0, r12, r1 + vst1.8 {d15}, [r12] + vst1.8 {d17}, [r0], r1 + add r2, r0, r1 + vst1.8 {d19}, [r0] + vst1.8 {d21}, [r2] + + bx lr + ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| + +;----------------- + AREA mbvloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_mbvlfy_coeff_ + DCD mbvlfy_coeff +mbvlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f + DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 + DCD 0x1b1b1b1b, 0x1b1b1b1b + + END diff --git a/vp8/common/arm/neon/recon16x16mb_neon.asm b/vp8/common/arm/neon/recon16x16mb_neon.asm new file mode 100644 index 000000000..b9ba1cbc3 --- /dev/null +++ b/vp8/common/arm/neon/recon16x16mb_neon.asm @@ -0,0 +1,130 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon16x16mb_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int ystride, +; stack unsigned char *udst_ptr, +; stack unsigned char *vdst_ptr + +|vp8_recon16x16mb_neon| PROC + mov r12, #4 ;loop counter for Y loop + +recon16x16mb_loop_y + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0]! + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1]! + + pld [r0] + pld [r1] + pld [r1, #64] + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + vadd.s16 q7, q7, q15 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vst1.u8 {q0}, [r2], r3 ;store result + vqmovun.s16 d6, q6 + vst1.u8 {q1}, [r2], r3 + vqmovun.s16 d7, q7 + vst1.u8 {q2}, [r2], r3 + subs r12, r12, #1 + + moveq r12, #2 ;loop counter for UV loop + + vst1.u8 {q3}, [r2], r3 + bne recon16x16mb_loop_y + + mov r3, r3, lsr #1 ;uv_stride = ystride>>1 + ldr r2, [sp] ;load upred_ptr + +recon16x16mb_loop_uv + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0]! + vld1.16 {q10, q11}, [r1]! 
+ + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1]! + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vadd.s16 q7, q7, q15 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vst1.u8 {d0}, [r2], r3 ;store result + vqmovun.s16 d4, q4 + vst1.u8 {d1}, [r2], r3 + vqmovun.s16 d5, q5 + vst1.u8 {d2}, [r2], r3 + vqmovun.s16 d6, q6 + vst1.u8 {d3}, [r2], r3 + vqmovun.s16 d7, q7 + vst1.u8 {d4}, [r2], r3 + subs r12, r12, #1 + + vst1.u8 {d5}, [r2], r3 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + ldrne r2, [sp, #4] ;load vpred_ptr + bne recon16x16mb_loop_uv + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/recon2b_neon.asm b/vp8/common/arm/neon/recon2b_neon.asm new file mode 100644 index 000000000..25aaf8c8e --- /dev/null +++ b/vp8/common/arm/neon/recon2b_neon.asm @@ -0,0 +1,53 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon2b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon2b_neon| PROC + vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr + vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr + + vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits + vld1.16 {q6, q7}, [r1]! 
+ vmovl.u8 q1, d17 + vmovl.u8 q2, d18 + vmovl.u8 q3, d19 + + vadd.s16 q0, q0, q4 ;add Diff data and Pred data together + vadd.s16 q1, q1, q5 + vadd.s16 q2, q2, q6 + vadd.s16 q3, q3, q7 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + add r0, r2, r3 + + vst1.u8 {d0}, [r2] ;store result + vst1.u8 {d1}, [r0], r3 + add r2, r0, r3 + vst1.u8 {d2}, [r0] + vst1.u8 {d3}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/recon4b_neon.asm b/vp8/common/arm/neon/recon4b_neon.asm new file mode 100644 index 000000000..a4f5b806b --- /dev/null +++ b/vp8/common/arm/neon/recon4b_neon.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon4b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon4b_neon| PROC + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0] + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! 
+ vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1] + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + vadd.s16 q7, q7, q15 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + add r0, r2, r3 + + vst1.u8 {q0}, [r2] ;store result + vst1.u8 {q1}, [r0], r3 + add r2, r0, r3 + vst1.u8 {q2}, [r0] + vst1.u8 {q3}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/reconb_neon.asm b/vp8/common/arm/neon/reconb_neon.asm new file mode 100644 index 000000000..16d85a0d5 --- /dev/null +++ b/vp8/common/arm/neon/reconb_neon.asm @@ -0,0 +1,60 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon_b_neon| PROC + mov r12, #16 + + vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr + vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr + vld1.u8 {d29}, [r0], r12 + vld1.16 {q11, q12}, [r1]! + vld1.u8 {d30}, [r0], r12 + vld1.16 {q12, q13}, [r1]! 
+    vld1.u8 {d31}, [r0], r12        ;load 4th 4-pixel pred row; r0 advances by r12 bytes
+    vld1.16 {q13}, [r1]             ;load last diff row (only low half d26 is used below)
+
+    vmovl.u8 q0, d28                ;modify Pred data from 8 bits to 16 bits
+    vmovl.u8 q1, d29                ;Pred data in d0, d2, d4, d6
+    vmovl.u8 q2, d30
+    vmovl.u8 q3, d31
+
+    vadd.s16 d0, d0, d20            ;add Diff data and Pred data together
+    vadd.s16 d2, d2, d22
+    vadd.s16 d4, d4, d24
+    vadd.s16 d6, d6, d26
+
+    vqmovun.s16 d0, q0              ;CLAMP() saturation: narrow s16 -> u8 with unsigned saturation
+    vqmovun.s16 d1, q1
+    vqmovun.s16 d2, q2
+    vqmovun.s16 d3, q3
+    add r1, r2, r3                  ;r1 = dst_ptr + stride (second output row)
+
+    vst1.32 {d0[0]}, [r2]           ;store result, one 4-byte row per line of dst
+    vst1.32 {d1[0]}, [r1], r3
+    add r2, r1, r3
+    vst1.32 {d2[0]}, [r1]
+    vst1.32 {d3[0]}, [r2], r3
+
+    bx lr
+
+    ENDP
+    END
diff --git a/vp8/common/arm/neon/save_neon_reg.asm b/vp8/common/arm/neon/save_neon_reg.asm
new file mode 100644
index 000000000..4873e447f
--- /dev/null
+++ b/vp8/common/arm/neon/save_neon_reg.asm
@@ -0,0 +1,35 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_push_neon|
+    EXPORT  |vp8_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 - pointer to a 64-byte save area
+;Spills d8-d15 (q4-q7) into the buffer at r0. These are the NEON registers
+;a callee must preserve (presumably per the AAPCS VFP rules -- confirm), so
+;callers use this around code that clobbers them.
+|vp8_push_neon| PROC
+    vst1.i64 {d8, d9, d10, d11}, [r0]!
+    vst1.i64 {d12, d13, d14, d15}, [r0]!
+    bx lr
+
+    ENDP
+
+; r0 - pointer to a 64-byte save area previously filled by vp8_push_neon
+;Restores d8-d15 (q4-q7) from the buffer at r0.
+|vp8_pop_neon| PROC
+    vld1.i64 {d8, d9, d10, d11}, [r0]!
+    vld1.i64 {d12, d13, d14, d15}, [r0]!
+    bx lr
+
+    ENDP
+
+    END
+
diff --git a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
new file mode 100644
index 000000000..7d06ff908
--- /dev/null
+++ b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
@@ -0,0 +1,66 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_1_neon|
+    EXPORT  |vp8_dc_only_idct_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+; r0  short *input;
+; r1  short *output;
+; r2  int pitch;
+;DC-only inverse transform: only input[0] is significant, so the result is
+;the constant (input[0] + 4) >> 3 replicated to all 16 outputs -- four rows
+;of four shorts, with consecutive rows 'pitch' bytes apart.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_short_idct4x4llm_1_neon| PROC
+    vld1.16 {d0[]}, [r0]            ;load input[0] and duplicate it into all 4 lanes of d0
+
+    add r3, r1, r2                  ;r3  = output + pitch    (row 1)
+    add r12, r3, r2                 ;r12 = output + 2*pitch  (row 2)
+
+    vrshr.s16 d0, d0, #3            ;rounding shift: (input[0] + 4) >> 3
+
+    add r0, r12, r2                 ;r0  = output + 3*pitch  (row 3)
+
+    vst1.16 {d0}, [r1]              ;store the same 4 shorts to each of the 4 rows
+    vst1.16 {d0}, [r3]
+    vst1.16 {d0}, [r12]
+    vst1.16 {d0}, [r0]
+
+    bx lr
+    ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+; r0  short input_dc;
+; r1  short *output;
+; r2  int pitch;
+;Same operation as above, but the DC value arrives in a register (r0)
+;instead of being loaded from memory.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_dc_only_idct_neon| PROC
+    vdup.16 d0, r0                  ;replicate input_dc into all 4 lanes of d0
+
+    add r3, r1, r2                  ;r3  = output + pitch    (row 1)
+    add r12, r3, r2                 ;r12 = output + 2*pitch  (row 2)
+
+    vrshr.s16 d0, d0, #3            ;rounding shift: (input_dc + 4) >> 3
+
+    add r0, r12, r2                 ;r0  = output + 3*pitch  (row 3)
+
+    vst1.16 {d0}, [r1]
+    vst1.16 {d0}, [r3]
+    vst1.16 {d0}, [r12]
+    vst1.16 {d0}, [r0]
+
+    bx lr
+
+    ENDP
+    END
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
new file mode 100644
index 000000000..ffecfbfbc
--- /dev/null
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -0,0 +1,126 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;r0 short * input
+;r1 short * output
+;r2 int pitch
+;*************************************************************
+;static const int cospi8sqrt2minus1=20091;
+;static const int sinpi8sqrt2      =35468;
+;static const int rounding = 0;
+;Optimization note: The resulted data from dequantization are signed 13-bit data that is
+;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since
+;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half
+;result of the multiplication that is needed in IDCT.
+;
+;Structure: two identical butterfly passes (columns, then rows after a
+;transpose), a final rounding shift (x+4)>>3, and a transpose back before
+;the 4 row stores.
+
+|vp8_short_idct4x4llm_neon| PROC
+    ldr r12, _idct_coeff_
+    vld1.16 {q1, q2}, [r0]          ;d2=ip[0..3], d3=ip[4..7], d4=ip[8..11], d5=ip[12..15]
+    vld1.16 {d0}, [r12]             ;d0 lanes = {cospi8sqrt2minus1, cospi8sqrt2minus1,
+                                    ;            sinpi8sqrt2, sinpi8sqrt2} as s16
+                                    ;(sinpi8sqrt2 wraps negative in s16; compensated below)
+
+    vswp d3, d4 ;q2(vp[4] vp[12])
+
+    vqdmulh.s16 q3, q2, d0[2]       ;high half of doubled product, sinpi8sqrt2 term
+    vqdmulh.s16 q4, q2, d0[0]       ;high half of doubled product, cospi8sqrt2minus1 term
+
+    vqadd.s16 d12, d2, d3 ;a1       ;a1 = ip[0] + ip[8]
+    vqsub.s16 d13, d2, d3 ;b1       ;b1 = ip[0] - ip[8]
+
+    vshr.s16 q3, q3, #1             ;undo the doubling done by vqdmulh
+    vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16 q4, q4, q2
+
+    ;d6 - c1:temp1
+    ;d7 - d1:temp2
+    ;d8 - d1:temp1
+    ;d9 - c1:temp2
+
+    vqsub.s16 d10, d6, d9 ;c1
+    vqadd.s16 d11, d7, d8 ;d1
+
+    vqadd.s16 d2, d12, d11          ;op[0] = a1 + d1
+    vqadd.s16 d3, d13, d10          ;op[4] = b1 + c1
+    vqsub.s16 d4, d13, d10          ;op[8] = b1 - c1
+    vqsub.s16 d5, d12, d11          ;op[12] = a1 - d1
+
+    vtrn.32 d2, d4                  ;4x4 transpose so the second pass works on rows
+    vtrn.32 d3, d5
+    vtrn.16 d2, d3
+    vtrn.16 d4, d5
+
+    vswp d3, d4
+
+    ;second pass: same butterfly as above
+    vqdmulh.s16 q3, q2, d0[2]
+    vqdmulh.s16 q4, q2, d0[0]
+
+    vqadd.s16 d12, d2, d3 ;a1
+    vqsub.s16 d13, d2, d3 ;b1
+
+    vshr.s16 q3, q3, #1
+    vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16 q4, q4, q2
+
+    vqsub.s16 d10, d6, d9 ;c1
+    vqadd.s16 d11, d7, d8 ;d1
+
+    vqadd.s16 d2, d12, d11
+    vqadd.s16 d3, d13, d10
+    vqsub.s16 d4, d13, d10
+    vqsub.s16 d5, d12, d11
+
+    vrshr.s16 d2, d2, #3            ;final scaling with rounding: (x + 4) >> 3
+    vrshr.s16 d3, d3, #3
+    vrshr.s16 d4, d4, #3
+    vrshr.s16 d5, d5, #3
+
+    add r3, r1, r2                  ;output row pointers, 'pitch' bytes apart
+    add r12, r3, r2
+    add r0, r12, r2
+
+    vtrn.32 d2, d4                  ;transpose back to row order for the stores
+    vtrn.32 d3, d5
+    vtrn.16 d2, d3
+    vtrn.16 d4, d5
+
+    vst1.16 {d2}, [r1]
+    vst1.16 {d3}, [r3]
+    vst1.16 {d4}, [r12]
+    vst1.16 {d5}, [r0]
+
+    bx  lr
+
+    ENDP
+
+;-----------------
+    AREA idct4x4_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_idct_coeff_
+    DCD idct_coeff
+idct_coeff
+    DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
new file mode 100644
index 000000000..9f5f0d2ce
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -0,0 +1,494 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+;Note: To take advantage of 8-bit multiplication instruction in NEON. First apply abs() to
+; filter coeffs to make them u8. Then, use vmlsl for negative coeffs. After multiplication,
+; the result can be negative. So, I treat the result as s16. But, since it is also possible
+; that the result can be a large positive number (> 2^15-1), which could be confused as a
+; negative number.
To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2, +; which ensures that the result stays in s16 range. Finally, saturated add the result by +; applying 3rd filter coeff. Same applys to other filter functions. + +|vp8_sixtap_predict16x16_neon| PROC + push {r4-r5, lr} + + ldr r12, _filter16_coeff_ + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter16x16_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter16x16_only + + sub sp, sp, #336 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #7 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (21x16) +filt_blk2d_fp16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d7, d0 + vmull.u8 q10, d9, d0 + vmull.u8 q11, d10, d0 + vmull.u8 q12, d12, d0 + vmull.u8 q13, d13, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d9, d10, #1 + vext.8 d30, d12, d13, #1 + + vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q12, d30, d1 + + vext.8 d28, d7, d8, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d13, d14, #1 + + vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q11, d29, d1 + vmlsl.u8 q13, d30, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d9, d10, #4 + 
vext.8 d30, d12, d13, #4 + + vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q12, d30, d4 + + vext.8 d28, d7, d8, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d13, d14, #4 + + vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q11, d29, d4 + vmlsl.u8 q13, d30, d4 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + vext.8 d30, d12, d13, #5 + + vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q10, d29, d5 + vmlal.u8 q12, d30, d5 + + vext.8 d28, d7, d8, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d13, d14, #5 + + vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q11, d29, d5 + vmlal.u8 q13, d30, d5 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d9, d10, #2 + vext.8 d30, d12, d13, #2 + + vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q10, d29, d2 + vmlal.u8 q12, d30, d2 + + vext.8 d28, d7, d8, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d13, d14, #2 + + vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q11, d29, d2 + vmlal.u8 q13, d30, d2 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d9, d10, #3 + vext.8 d30, d12, d13, #3 + + vext.8 d15, d7, d8, #3 + vext.8 d31, d10, d11, #3 + vext.8 d6, d13, d14, #3 + + vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + + vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) + vqadd.s16 q10, q5 + vqadd.s16 q12, q6 + + vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q7, d31, d3 + vmull.u8 q3, d6, d3 + + subs r2, r2, #1 + + vqadd.s16 q9, q6 + vqadd.s16 q11, q7 + vqadd.s16 q13, q3 + + vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q9, #7 + vqrshrun.s16 d8, q10, #7 + vqrshrun.s16 d9, q11, #7 + vqrshrun.s16 d10, q12, #7 + vqrshrun.s16 d11, q13, #7 + + vst1.u8 {d6, d7, d8}, [lr]! ;store result + vst1.u8 {d9, d10, d11}, [lr]! 
+ + bne filt_blk2d_fp16x16_loop_neon + +;Second pass: 16x16 +;secondpass_filter - do first 8-columns and then second 8-columns + add r3, r12, r3, lsl #5 + sub lr, lr, #336 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + mov r2, #16 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp16x16_outloop_neon + vld1.u8 {d18}, [lr], r2 ;load src data + vld1.u8 {d19}, [lr], r2 + vld1.u8 {d20}, [lr], r2 + vld1.u8 {d21}, [lr], r2 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [lr], r2 + +secondpass_inner_loop_neon + vld1.u8 {d23}, [lr], r2 ;load src data + vld1.u8 {d24}, [lr], r2 + vld1.u8 {d25}, [lr], r2 + vld1.u8 {d26}, [lr], r2 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 
{d9}, [r4], r5 + + bne secondpass_inner_loop_neon + + subs r3, r3, #1 + sub lr, lr, #336 + add lr, lr, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_sp16x16_outloop_neon + + add sp, sp, #336 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter16x16_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #8 ;loop counter + sub r0, r0, #2 ;move srcptr back to (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + + pld [r0] + pld [r0, r1] + + vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q7, d7, d0 + vmull.u8 q8, d9, d0 + vmull.u8 q9, d10, d0 + + vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d21, d9, d10, #1 + vext.8 d22, d7, d8, #1 + vext.8 d23, d10, d11, #1 + vext.8 d24, d6, d7, #4 ;construct src_ptr[2] + vext.8 d25, d9, d10, #4 + vext.8 d26, d7, d8, #4 + vext.8 d27, d10, d11, #4 + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + + vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d21, d1 + vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d23, d1 + vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d25, d4 + vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d27, d4 + vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + + vext.8 d20, d7, d8, #5 + vext.8 d21, d10, d11, #5 + vext.8 d22, d6, d7, #2 ;construct src_ptr[0] + vext.8 d23, d9, d10, #2 + vext.8 d24, d7, d8, #2 + vext.8 d25, d10, d11, #2 + + vext.8 d26, d6, d7, #3 ;construct src_ptr[1] + vext.8 d27, d9, d10, #3 + vext.8 d28, d7, d8, #3 + vext.8 d29, d10, d11, #3 + + vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d21, d5 + vmlal.u8 q6, d22, d2 
;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d23, d2 + vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d25, d2 + + vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q11, d27, d3 + vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q15, d29, d3 + + vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q11 + vqadd.s16 q7, q12 + vqadd.s16 q9, q15 + + subs r2, r2, #1 + + vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q7, #7 + vqrshrun.s16 d8, q8, #7 + vqrshrun.s16 d9, q9, #7 + + vst1.u8 {q3}, [r4], r5 ;store result + vst1.u8 {q4}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + + pop {r4-r5,pc} + +;-------------------- +secondpass_filter16x16_only +;Second pass: 16x16 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_spo16x16_outloop_neon + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [r0], r1 + +secondpass_only_inner_loop_neon + vld1.u8 {d23}, [r0], r1 ;load src data + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * 
vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_only_inner_loop_neon + + subs r3, r3, #1 + sub r0, r0, r1, lsl #4 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1 + add r0, r0, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_spo16x16_outloop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA subpelfilters16_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter16_coeff_ + DCD filter16_coeff +filter16_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm new file mode 100644 index 000000000..c23a9dbd1 --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm @@ -0,0 +1,425 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_sixtap_predict_neon| PROC + push {r4, lr} + + ldr r12, _filter4_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter4x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter4x4_only + + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + sub r0, r0, r1, lsl #1 ;go back 2 lines of src data + +;First pass: output_height lines x output_width columns (9x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, 
d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + + vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data + vld1.u8 {q4}, [r0], r1 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + ;First Pass on rest 5-line data + vld1.u8 {q11}, [r0], r1 + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vext.8 d31, d22, d23, #5 ;construct src_ptr[3] + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5]) + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], 
and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] + + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vext.8 d31, d22, d23, #4 ;construct src_ptr[2] + + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vext.8 d31, d22, d23, #2 ;construct src_ptr[0] + + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vext.8 d31, d22, d23, #3 ;construct src_ptr[1] + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3]) + + add r3, r12, r3, lsl #5 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + vqadd.s16 q12, q11 + + vext.8 d23, d27, d28, #4 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d30, q8, #7 + vqrshrun.s16 d31, q12, #7 + +;Second pass: 4x4 + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vdup.8 d0, 
d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +firstpass_filter4x4_only + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + +;First pass: output_height lines x output_width columns (4x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + 
vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + vst1.32 {d27[0]}, [r4] ;store result + vst1.32 {d27[1]}, [r0] + vst1.32 {d28[0]}, [r1] + vst1.32 {d28[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +secondpass_filter4x4_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.32 {d27[0]}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.32 {d27[1]}, [r0], r1 + vabs.s32 q7, q5 + vld1.32 {d28[0]}, [r0], r1 + vabs.s32 q8, q6 + vld1.32 {d28[1]}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.32 {d29[0]}, [r0], r1 + vdup.8 d1, d14[4] + vld1.32 {d29[1]}, [r0], r1 + vdup.8 d2, d15[0] + vld1.32 {d30[0]}, [r0], r1 + vdup.8 d3, d15[4] + vld1.32 {d30[1]}, [r0], r1 + vdup.8 d4, d16[0] + vld1.32 {d31[0]}, [r0], r1 + vdup.8 d5, d16[4] + + vext.8 d23, d27, d28, #4 + 
vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + ENDP + +;----------------- + AREA subpelfilters4_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter4_coeff_ + DCD filter4_coeff +filter4_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm new file mode 100644 index 000000000..18e19f958 --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm @@ -0,0 +1,476 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x4_neon| PROC + push {r4-r5, lr} + + ldr r12, _filter8_coeff_ + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x4_only + + sub sp, sp, #32 ;reserve space on stack for temporary storage + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + mov lr, sp + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (9x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct 
src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q6}, [r0], r1 + vst1.u8 {d24}, [lr]! + vld1.u8 {q7}, [r0], r1 + vst1.u8 {d25}, [lr]! 
+ + ;first_pass filtering on the rest 5-line data + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d27, q9, #7 + vqrshrun.s16 d28, q10, #7 + vqrshrun.s16 d29, 
q11, #7 ;load intermediate data from stack + vqrshrun.s16 d30, q12, #7 + +;Second pass: 8x4 +;secondpass_filter + add r3, r12, r3, lsl #5 + sub lr, lr, #32 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {q12}, [lr]! + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + add sp, sp, #32 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter8x4_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + vld1.u8 {q3}, [r0], r1 ;load src data + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + 
vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (4x8) + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 {d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x4_only 
+;Second pass: 8x4 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {d22}, [r0], r1 + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d25}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d28}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d29}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d30}, [r0], r1 + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. 
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter8_coeff_ + DCD filter8_coeff +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm new file mode 100644 index 000000000..d27485e6c --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm @@ -0,0 +1,527 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x8_neon| PROC + push {r4-r5, lr} + + ldr r12, _filter8_coeff_ + + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x8_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x8_only + + sub sp, sp, #64 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + 
vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (13x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + +filt_blk2d_fp8x8_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + subs r2, r2, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + 
vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d24}, [lr]! + vld1.u8 {q6}, [r0], r1 + vst1.u8 {d25}, [lr]! + + bne filt_blk2d_fp8x8_loop_neon + + ;first_pass filtering on the rest 5-line data + ;vld1.u8 {q3}, [r0], r1 ;load src data + ;vld1.u8 {q4}, [r0], r1 + ;vld1.u8 {q5}, [r0], r1 + ;vld1.u8 {q6}, [r0], r1 + vld1.u8 {q7}, [r0], r1 + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + 
vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + add r3, r12, r3, lsl #5 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + sub lr, lr, #64 + vqrshrun.s16 d27, q9, #7 + vld1.u8 {q9}, [lr]! ;load intermediate data from stack + vqrshrun.s16 d28, q10, #7 + vld1.u8 {q10}, [lr]! + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q11, #7 + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vqrshrun.s16 d30, q12, #7 + vld1.u8 {q12}, [lr]! + +;Second pass: 8x8 + mov r3, #2 ;loop counter + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 
d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_sp8x8_loop_neon + + add sp, sp, #64 + pop {r4-r5,pc} + +;--------------------- +firstpass_filter8x8_only + ;add r2, r12, r2, lsl #5 ;calculate filter location + ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (8x8) +filt_blk2d_fpo8x8_loop_neon + vld1.u8 {q3}, [r0], r1 ;load src data + vld1.u8 {q4}, [r0], r1 + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + 
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + ; + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + subs r2, r2, #1 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 {d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + bne filt_blk2d_fpo8x8_loop_neon + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x8_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {d19}, [r0], r1 + vabs.s32 q7, q5 + vld1.u8 {d20}, [r0], r1 + vabs.s32 q8, q6 + vld1.u8 {d21}, [r0], r1 + mov r3, #2 ;loop counter + vld1.u8 {d22}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d23}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d24}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d25}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + +;Second pass: 8x8 +filt_blk2d_spo8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 
q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_spo8x8_loop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter8_coeff_ + DCD filter8_coeff +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/recon_arm.c b/vp8/common/arm/recon_arm.c new file mode 100644 index 000000000..130059e64 --- /dev/null +++ b/vp8/common/arm/recon_arm.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "blockd.h" + +extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr); + +/* +void vp8_recon16x16mby(MACROBLOCKD *x) +{ + int i; + for(i=0;i<16;i+=4) + { + //vp8_recon4b(&x->block[i]); + BLOCKD *b = &x->block[i]; + vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } +} +*/ +void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + BLOCKD *b = &x->block[0]; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + //b = &x->block[4]; + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + //b = &x->block[8]; + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + //b = &x->block[12]; + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); +} + +#if HAVE_ARMV7 +void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + unsigned char *pred_ptr = &x->predictor[0]; + short *diff_ptr = &x->diff[0]; + unsigned char *dst_ptr = x->dst.y_buffer; + unsigned char *udst_ptr = x->dst.u_buffer; + unsigned char *vdst_ptr = x->dst.v_buffer; + int ystride = x->dst.y_stride; + //int uv_stride = x->dst.uv_stride; + + vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr); +} + +#else +/* +void vp8_recon16x16mb(MACROBLOCKD *x) +{ + int i; + + for(i=0;i<16;i+=4) + { +// vp8_recon4b(&x->block[i]); + BLOCKD *b = &x->block[i]; + vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + } + for(i=16;i<24;i+=2) + { +// vp8_recon2b(&x->block[i]); + BLOCKD *b = &x->block[i]; + vp8_recon2b (b->predictor, b->diff, 
*(b->base_dst) + b->dst, b->dst_stride); + } +} +*/ +void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + BLOCKD *b = &x->block[0]; + + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + + //b = &x->block[16]; + + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b++; + b++; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b++; + b++; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b++; + b++; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); +} +#endif diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h new file mode 100644 index 000000000..fd9f85eea --- /dev/null +++ b/vp8/common/arm/recon_arm.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef RECON_ARM_H +#define RECON_ARM_H + +#if HAVE_ARMV6 +extern prototype_recon_block(vp8_recon_b_armv6); +extern prototype_recon_block(vp8_recon2b_armv6); +extern prototype_recon_block(vp8_recon4b_armv6); + +extern prototype_copy_block(vp8_copy_mem8x8_v6); +extern prototype_copy_block(vp8_copy_mem8x4_v6); +extern prototype_copy_block(vp8_copy_mem16x16_v6); + +#undef vp8_recon_recon +#define vp8_recon_recon vp8_recon_b_armv6 + +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp8_recon2b_armv6 + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp8_recon4b_armv6 + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6 + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp8_copy_mem8x4_v6 + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_v6 +#endif + +#if HAVE_ARMV7 +extern prototype_recon_block(vp8_recon_b_neon); +extern prototype_recon_block(vp8_recon2b_neon); +extern prototype_recon_block(vp8_recon4b_neon); + +extern prototype_copy_block(vp8_copy_mem8x8_neon); +extern prototype_copy_block(vp8_copy_mem8x4_neon); +extern prototype_copy_block(vp8_copy_mem16x16_neon); + +#undef vp8_recon_recon +#define vp8_recon_recon vp8_recon_b_neon + +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp8_recon2b_neon + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp8_recon4b_neon + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp8_copy_mem8x4_neon + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon +#endif + +#endif diff --git a/vp8/common/arm/reconintra4x4_arm.c b/vp8/common/arm/reconintra4x4_arm.c new file mode 100644 index 000000000..334d35236 --- /dev/null +++ b/vp8/common/arm/reconintra4x4_arm.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "vpx_mem/vpx_mem.h" +#include "reconintra.h" + +void vp8_predict_intra4x4(BLOCKD *x, + int b_mode, + unsigned char *predictor) +{ + int i, r, c; + + unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride; + unsigned char Left[4]; + unsigned char top_left = Above[-1]; + + Left[0] = (*(x->base_dst))[x->dst - 1]; + Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; + Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; + Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; + + switch (b_mode) + { + case B_DC_PRED: + { + int expected_dc = 0; + + for (i = 0; i < 4; i++) + { + expected_dc += Above[i]; + expected_dc += Left[i]; + } + + expected_dc = (expected_dc + 4) >> 3; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = expected_dc; + } + + predictor += 16; + } + } + break; + case B_TM_PRED: + { + // prediction similar to true_motion prediction + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int pred = Above[c] - top_left + Left[r]; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + predictor[c] = pred; + } + + predictor += 16; + } + } + break; + + case B_VE_PRED: + { + + unsigned int ap[4]; + ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2; + ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2; + ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2; + ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + + predictor[c] = ap[c]; + } + + predictor += 16; + } + + } + break; + + + case B_HE_PRED: + { + + unsigned int lp[4]; + lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2; + lp[1] = 
(Left[0] + 2 * Left[1] + Left[2] + 2) >> 2; + lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2; + lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = lp[r]; + } + + predictor += 16; + } + } + break; + case B_LD_PRED: + { + unsigned char *ptr = Above; + predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + predictor[0 * 16 + 1] = + predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + + } + break; + case B_RD_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[3 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + + } + break; + case B_VR_PRED: + { + + 
unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; + predictor[3 * 16 + 2] = + predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; + predictor[3 * 16 + 3] = + predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; + predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; + + } + break; + case B_VL_PRED: + { + + unsigned char *pp = Above; + + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; + predictor[1 * 16 + 1] = + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + case B_HD_PRED: + { + unsigned char pp[9]; + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = 
Above[3]; + + + predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; + predictor[2 * 16 + 1] = + predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + + case B_HU_PRED: + { + unsigned char *pp = Left; + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[2 * 16 + 3] = + predictor[3 * 16 + 0] = + predictor[3 * 16 + 1] = + predictor[3 * 16 + 2] = + predictor[3 * 16 + 3] = pp[3]; + } + break; + + + } +} +// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and +// to the right prediction have filled in pixels to use. 
+void vp8_intra_prediction_down_copy(MACROBLOCKD *x) +{ + unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; + + unsigned int *src_ptr = (unsigned int *)above_right; + unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride); + unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride); + unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride); + + *dst_ptr0 = *src_ptr; + *dst_ptr1 = *src_ptr; + *dst_ptr2 = *src_ptr; +} + + + +/* +void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + int i; + + vp8_intra_prediction_down_copy(x); + + for(i=0;i<16;i++) + { + BLOCKD *b = &x->block[i]; + + vp8_predict_intra4x4(b, x->block[i].bmi.mode,x->block[i].predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } + + vp8_recon_intra_mbuv(x); + +} +*/ +void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + int i; + BLOCKD *b = &x->block[0]; + + vp8_intra_prediction_down_copy(x); + + { + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, 
recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } + + vp8_recon_intra_mbuv(rtcd, x); + +} diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c new file mode 100644 index 000000000..d7ee1ddfa --- /dev/null +++ b/vp8/common/arm/reconintra_arm.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "blockd.h" +#include "reconintra.h" +#include "vpx_mem/vpx_mem.h" +#include "recon.h" + +#if HAVE_ARMV7 +extern void vp8_build_intra_predictors_mby_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x) +{ + unsigned char *y_buffer = x->dst.y_buffer; + unsigned char *ypred_ptr = x->predictor; + int y_stride = x->dst.y_stride; + int mode = x->mbmi.mode; + int Up = x->up_available; + int Left = x->left_available; + + vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); +} +#endif + + +#if HAVE_ARMV7 +extern void vp8_build_intra_predictors_mby_s_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x) +{ + unsigned char *y_buffer = x->dst.y_buffer; + unsigned char *ypred_ptr = x->predictor; + int y_stride = x->dst.y_stride; + int mode = x->mbmi.mode; + int Up = x->up_available; + int Left = x->left_available; + + vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); +} + +#endif diff --git a/vp8/common/arm/subpixel_arm.h b/vp8/common/arm/subpixel_arm.h new file mode 100644 index 000000000..56aec55b9 --- /dev/null +++ b/vp8/common/arm/subpixel_arm.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef SUBPIXEL_ARM_H +#define SUBPIXEL_ARM_H + +#if HAVE_ARMV6 +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_armv6); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_armv6); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_armv6); +extern prototype_subpixel_predict(vp8_sixtap_predict_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict16x16_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6); + +#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6 + +#undef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_armv6 + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_armv6 + +#undef vp8_subpix_sixtap4x4 +#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_armv6 + +#undef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_armv6 + +#undef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_armv6 + +#undef vp8_subpix_bilinear8x4 +#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_armv6 + +#undef vp8_subpix_bilinear4x4 +#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_neon); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_neon); +extern prototype_subpixel_predict(vp8_sixtap_predict_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict16x16_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon); + 
+#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon + +#undef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_neon + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_neon + +#undef vp8_subpix_sixtap4x4 +#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_neon + +#undef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_neon + +#undef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_neon + +#undef vp8_subpix_bilinear8x4 +#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_neon + +#undef vp8_subpix_bilinear4x4 +#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon +#endif + +#endif diff --git a/vp8/common/arm/systemdependent.c b/vp8/common/arm/systemdependent.c new file mode 100644 index 000000000..ecc6929c0 --- /dev/null +++ b/vp8/common/arm/systemdependent.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "g_common.h" +#include "pragmas.h" +#include "subpixel.h" +#include "loopfilter.h" +#include "recon.h" +#include "idct.h" +#include "onyxc_int.h" + +void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); + +void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); + +void vp8_machine_specific_config(VP8_COMMON *ctx) +{ +#if CONFIG_RUNTIME_CPU_DETECT + VP8_COMMON_RTCD *rtcd = &ctx->rtcd; + +#if HAVE_ARMV7 + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_neon; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_neon; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_neon; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_neon; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_neon; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; + + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon; + rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_neon; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_neon; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_neon; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_neon; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_neon; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon; + rtcd->recon.copy8x8 = 
vp8_copy_mem8x8_neon; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon; + rtcd->recon.recon = vp8_recon_b_neon; + rtcd->recon.recon2 = vp8_recon2b_neon; + rtcd->recon.recon4 = vp8_recon4b_neon; +#elif HAVE_ARMV6 + + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_armv6; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_armv6; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_armv6; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_armv6; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_armv6; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; + + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6; + rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_armv6; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_armv6; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_armv6; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; + rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6; + rtcd->recon.recon = vp8_recon_b_armv6; + rtcd->recon.recon2 = vp8_recon2b_armv6; + rtcd->recon.recon4 = vp8_recon4b_armv6; +#else +//pure c + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c; + rtcd->idct.idct16 = vp8_short_idct4x4llm_c; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_c; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_c; + 
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_c; + rtcd->recon.recon = vp8_recon_b_c; + rtcd->recon.recon2 = vp8_recon2b_c; + rtcd->recon.recon4 = vp8_recon4b_c; + + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c; +#endif + + rtcd->postproc.down = vp8_mbpost_proc_down_c; + rtcd->postproc.across = vp8_mbpost_proc_across_ip_c; + rtcd->postproc.downacross = vp8_post_proc_down_and_across_c; + rtcd->postproc.addnoise = vp8_plane_add_noise_c; +#endif + +#if HAVE_ARMV7 + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby_neon; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s_neon; +#elif HAVE_ARMV6 + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; +#else + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; + +#endif + +} diff --git a/vp8/common/arm/vpx_asm_offsets.c b/vp8/common/arm/vpx_asm_offsets.c new file mode 100644 index 000000000..68634bf55 --- /dev/null +++ 
b/vp8/common/arm/vpx_asm_offsets.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include <stddef.h> + +#if CONFIG_VP8_ENCODER +#include "vpx_scale/yv12config.h" +#endif + +#if CONFIG_VP8_DECODER +#include "onyxd_int.h" +#endif + +#define DEFINE(sym, val) int sym = val; + +/* +#define BLANK() asm volatile("\n->" : : ) +*/ + +/* + * int main(void) + * { + */ + +#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER +DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width)); +DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height)); +DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride)); +DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width)); +DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height)); +DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride)); +DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer)); +DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer)); +DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer)); +DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border)); +#endif + +#if CONFIG_VP8_DECODER +DEFINE(mb_diff, offsetof(MACROBLOCKD, diff)); +DEFINE(mb_predictor, offsetof(MACROBLOCKD, predictor)); +DEFINE(mb_dst_y_stride, offsetof(MACROBLOCKD, dst.y_stride)); +DEFINE(mb_dst_y_buffer, offsetof(MACROBLOCKD, dst.y_buffer)); +DEFINE(mb_dst_u_buffer, offsetof(MACROBLOCKD, dst.u_buffer)); +DEFINE(mb_dst_v_buffer, offsetof(MACROBLOCKD, dst.v_buffer)); +DEFINE(mb_mbmi_mode, offsetof(MACROBLOCKD, mbmi.mode)); 
+DEFINE(mb_up_available, offsetof(MACROBLOCKD, up_available)); +DEFINE(mb_left_available, offsetof(MACROBLOCKD, left_available)); + +DEFINE(detok_scan, offsetof(DETOK, scan)); +DEFINE(detok_ptr_onyxblock2context_leftabove, offsetof(DETOK, ptr_onyxblock2context_leftabove)); +DEFINE(detok_onyx_coef_tree_ptr, offsetof(DETOK, vp8_coef_tree_ptr)); +DEFINE(detok_teb_base_ptr, offsetof(DETOK, teb_base_ptr)); +DEFINE(detok_norm_ptr, offsetof(DETOK, norm_ptr)); +DEFINE(detok_ptr_onyx_coef_bands_x, offsetof(DETOK, ptr_onyx_coef_bands_x)); + +DEFINE(DETOK_A, offsetof(DETOK, A)); +DEFINE(DETOK_L, offsetof(DETOK, L)); + +DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr)); +DEFINE(detok_current_bc, offsetof(DETOK, current_bc)); +DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs)); +DEFINE(detok_eob, offsetof(DETOK, eob)); + +DEFINE(bool_decoder_lowvalue, offsetof(BOOL_DECODER, lowvalue)); +DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range)); +DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value)); +DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count)); +DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer)); +DEFINE(bool_decoder_user_buffer_sz, offsetof(BOOL_DECODER, user_buffer_sz)); +DEFINE(bool_decoder_decode_buffer, offsetof(BOOL_DECODER, decode_buffer)); +DEFINE(bool_decoder_read_ptr, offsetof(BOOL_DECODER, read_ptr)); +DEFINE(bool_decoder_write_ptr, offsetof(BOOL_DECODER, write_ptr)); + +DEFINE(tokenextrabits_min_val, offsetof(TOKENEXTRABITS, min_val)); +DEFINE(tokenextrabits_length, offsetof(TOKENEXTRABITS, Length)); +#endif + +//add asserts for any offset that is not supported by assembly code +//add asserts for any size that is not supported by assembly code +/* + * return 0; + * } + */ diff --git a/vp8/common/bigend.h b/vp8/common/bigend.h new file mode 100644 index 000000000..6a91ba1ae --- /dev/null +++ b/vp8/common/bigend.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef _bigend_h +#define _bigend_h + +#if defined(__cplusplus) +extern "C" { +#endif + +#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) ) +#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) ) + +#define high_byte(x) (unsigned char)x +#define mid2Byte(x) (unsigned char)(x >> 8) +#define mid1Byte(x) (unsigned char)(x >> 16) +#define low_byte(x) (unsigned char)(x >> 24) + +#define SWAPENDS 1 + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/vp8/common/blockd.c b/vp8/common/blockd.c new file mode 100644 index 000000000..53f5e72d2 --- /dev/null +++ b/vp8/common/blockd.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_setup_temp_context(TEMP_CONTEXT *t, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int count) +{ + vpx_memcpy(t->l, l, sizeof(ENTROPY_CONTEXT) * count); + vpx_memcpy(t->a, a, sizeof(ENTROPY_CONTEXT) * count); +} + +const int vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0}; +const int vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0}; +const int vp8_block2type[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1}; +const int vp8_block2context[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3}; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h new file mode 100644 index 000000000..84ed53ad2 --- /dev/null +++ b/vp8/common/blockd.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef __INC_BLOCKD_H +#define __INC_BLOCKD_H + +void vpx_log(const char *format, ...); + +#include "vpx_ports/config.h" +#include "vpx_scale/yv12config.h" +#include "mv.h" +#include "treecoder.h" +#include "subpixel.h" +#include "vpx_ports/mem.h" + +#define TRUE 1 +#define FALSE 0 + +//#define DCPRED 1 +#define DCPREDSIMTHRESH 0 +#define DCPREDCNTTHRESH 3 + +#define Y1CONTEXT 0 +#define UCONTEXT 1 +#define VCONTEXT 2 +#define Y2CONTEXT 3 + +#define MB_FEATURE_TREE_PROBS 3 +#define MAX_MB_SEGMENTS 4 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 4 + +// Segment Feature Masks +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +typedef struct +{ + int r, c; +} POS; + + +typedef int ENTROPY_CONTEXT; + +typedef struct +{ + ENTROPY_CONTEXT l[4]; + ENTROPY_CONTEXT a[4]; +} TEMP_CONTEXT; + +extern void vp8_setup_temp_context(TEMP_CONTEXT *t, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int count); +extern const int vp8_block2left[25]; +extern const int vp8_block2above[25]; +extern const int vp8_block2type[25]; +extern const int vp8_block2context[25]; + +#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \ + Dest = ((A)!=0) + ((B)!=0); + + +typedef enum +{ + KEY_FRAME = 0, + INTER_FRAME = 1 +} FRAME_TYPE; + +typedef enum +{ + DC_PRED, // average of above and left pixels + V_PRED, // vertical prediction + H_PRED, // horizontal prediction + TM_PRED, // Truemotion prediction + B_PRED, // block based prediction, each block has its own prediction mode + + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + SPLITMV, + + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +// Macroblock level features +typedef enum +{ + MB_LVL_ALT_Q = 0, // Use alternate Quantizer .... + MB_LVL_ALT_LF = 1, // Use alternate loop filter value... 
+ MB_LVL_MAX = 2, // Number of MB level features supported + +} MB_LVL_FEATURES; + +// Segment Feature Masks +#define SEGMENT_ALTQ 0x01 +#define SEGMENT_ALT_LF 0x02 + +#define VP8_YMODES (B_PRED + 1) +#define VP8_UV_MODES (TM_PRED + 1) + +#define VP8_MVREFS (1 + SPLITMV - NEARESTMV) + +typedef enum +{ + B_DC_PRED, // average of above and left pixels + B_TM_PRED, + + B_VE_PRED, // vertical prediction + B_HE_PRED, // horizontal prediction + + B_LD_PRED, + B_RD_PRED, + + B_VR_PRED, + B_VL_PRED, + B_HD_PRED, + B_HU_PRED, + + LEFT4X4, + ABOVE4X4, + ZERO4X4, + NEW4X4, + + B_MODE_COUNT +} B_PREDICTION_MODE; + +#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ +#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct +{ + B_PREDICTION_MODE mode; + union + { + int as_int; + MV as_mv; + } mv; +} B_MODE_INFO; + + +typedef enum +{ + INTRA_FRAME = 0, + LAST_FRAME = 1, + GOLDEN_FRAME = 2, + ALTREF_FRAME = 3, + MAX_REF_FRAMES = 4 +} MV_REFERENCE_FRAME; + +typedef struct +{ + MB_PREDICTION_MODE mode, uv_mode; + MV_REFERENCE_FRAME ref_frame; + union + { + int as_int; + MV as_mv; + } mv; + int partitioning; + int partition_count; + int mb_skip_coeff; //does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens + int dc_diff; + unsigned char segment_id; // Which set of segmentation parameters should be used for this MB + int force_no_skip; + + B_MODE_INFO partition_bmi[16]; + +} MB_MODE_INFO; + + +typedef struct +{ + MB_MODE_INFO mbmi; + B_MODE_INFO bmi[16]; +} MODE_INFO; + + +typedef struct +{ + short *qcoeff; + short *dqcoeff; + unsigned char *predictor; + short *diff; + short *reference; + + short(*dequant)[4]; + + // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries + unsigned char **base_pre; + int pre; + int pre_stride; + + unsigned char **base_dst; + int dst; + 
int dst_stride; + + int eob; + + B_MODE_INFO bmi; + +} BLOCKD; + +typedef struct +{ + DECLARE_ALIGNED(16, short, diff[400]); // from idct diff + DECLARE_ALIGNED(16, unsigned char, predictor[384]); + DECLARE_ALIGNED(16, short, reference[384]); + DECLARE_ALIGNED(16, short, qcoeff[400]); + DECLARE_ALIGNED(16, short, dqcoeff[400]); + + // 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. + BLOCKD block[25]; + + YV12_BUFFER_CONFIG pre; // Filtered copy of previous frame reconstruction + YV12_BUFFER_CONFIG dst; + + MODE_INFO *mode_info_context; + MODE_INFO *mode_info; + + int mode_info_stride; + + FRAME_TYPE frame_type; + + MB_MODE_INFO mbmi; + + int up_available; + int left_available; + + // Y,U,V,Y2 + ENTROPY_CONTEXT *above_context[4]; // row of context for each plane + ENTROPY_CONTEXT(*left_context)[4]; // (up to) 4 contexts "" + + // 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. + unsigned char segmentation_enabled; + + // 0 (do not update) 1 (update) the macroblock segmentation map. + unsigned char update_mb_segmentation_map; + + // 0 (do not update) 1 (update) the macroblock segmentation feature data. + unsigned char update_mb_segmentation_data; + + // 0 (do not update) 1 (update) the macroblock segmentation feature data. 
+ unsigned char mb_segement_abs_delta; + + // Per frame flags that define which MB level features (such as quantizer or loop filter level) + // are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO + vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; // Probability Tree used to code Segment number + + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment parameters + + // mode_based Loop filter adjustment + unsigned char mode_ref_lf_delta_enabled; + unsigned char mode_ref_lf_delta_update; + + // Delta values have the range +/- MAX_LOOP_FILTER + //char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF + //char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT + signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF + signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT + + // Distance of MB away from frame edges + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + //char * gf_active_ptr; + signed char *gf_active_ptr; + + unsigned int frames_since_golden; + unsigned int frames_till_alt_ref_frame; + vp8_subpix_fn_t subpixel_predict; + vp8_subpix_fn_t subpixel_predict8x4; + vp8_subpix_fn_t subpixel_predict8x8; + vp8_subpix_fn_t subpixel_predict16x16; + + void *current_bc; + +#if CONFIG_RUNTIME_CPU_DETECT + struct VP8_COMMON_RTCD *rtcd; +#endif +} MACROBLOCKD; + + +extern void vp8_build_block_doffsets(MACROBLOCKD *x); +extern void vp8_setup_block_dptrs(MACROBLOCKD *x); + +#endif /* __INC_BLOCKD_H */ diff --git a/vp8/common/boolcoder.h b/vp8/common/boolcoder.h new file mode 100644 index 000000000..0659d4873 --- /dev/null +++ b/vp8/common/boolcoder.h @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef bool_coder_h +#define bool_coder_h 1 + +/* Arithmetic bool coder with largish probability range. + Timothy S Murphy 6 August 2004 */ + +/* So as not to force users to drag in too much of my idiosyncratic C++ world, + I avoid fancy storage management. */ + +#include <assert.h> + +#include <stddef.h> +#include <stdio.h> + +typedef unsigned char vp8bc_index_t; // probability index + +/* There are a couple of slight variants in the details of finite-precision + arithmetic coding. May be safely ignored by most users. */ + +enum vp8bc_rounding +{ + vp8bc_down = 0, // just like VP8 + vp8bc_down_full = 1, // handles minimum probability correctly + vp8bc_up = 2 +}; + +#if _MSC_VER + +/* Note that msvc by default does not inline _anything_ (regardless of the + setting of inline_depth) and that a command-line option (-Ob1 or -Ob2) + is required to inline even the smallest functions. */ + +# pragma inline_depth( 255) // I mean it when I inline something +# pragma warning( disable : 4099) // No class vs. struct harassment +# pragma warning( disable : 4250) // dominance complaints +# pragma warning( disable : 4284) // operator-> in templates +# pragma warning( disable : 4800) // bool conversion + +// don't let prefix ++,-- stand in for postfix, disaster would ensue + +# pragma warning( error : 4620 4621) + +#endif // _MSC_VER + + +#if __cplusplus + +// Sometimes one wishes to be definite about integer lengths. 
+ +struct int_types +{ + typedef const bool cbool; + typedef const signed char cchar; + typedef const short cshort; + typedef const int cint; + typedef const int clong; + + typedef const double cdouble; + typedef const size_t csize_t; + + typedef unsigned char uchar; // 8 bits + typedef const uchar cuchar; + + typedef short int16; + typedef unsigned short uint16; + typedef const int16 cint16; + typedef const uint16 cuint16; + + typedef int int32; + typedef unsigned int uint32; + typedef const int32 cint32; + typedef const uint32 cuint32; + + typedef unsigned int uint; + typedef unsigned int ulong; + typedef const uint cuint; + typedef const ulong culong; + + + // All structs consume space, may as well have a vptr. + + virtual ~int_types(); +}; + + +struct bool_coder_spec; +struct bool_coder; +struct bool_writer; +struct bool_reader; + + +struct bool_coder_namespace : int_types +{ + typedef vp8bc_index_t Index; + typedef bool_coder_spec Spec; + typedef const Spec c_spec; + + enum Rounding + { + Down = vp8bc_down, + down_full = vp8bc_down_full, + Up = vp8bc_up + }; +}; + + +// Archivable specification of a bool coder includes rounding spec +// and probability mapping table. The latter replaces a uchar j +// (0 <= j < 256) with an arbitrary uint16 tbl[j] = p. +// p/65536 is then the probability of a zero. 
+ +struct bool_coder_spec : bool_coder_namespace +{ + friend struct bool_coder; + friend struct bool_writer; + friend struct bool_reader; + friend struct bool_coder_spec_float; + friend struct bool_coder_spec_explicit_table; + friend struct bool_coder_spec_exponential_table; + friend struct BPsrc; +private: + uint w; // precision + Rounding r; + + uint ebits, mbits, ebias; + uint32 mmask; + + Index max_index, half_index; + + uint32 mantissa(Index i) const + { + assert(i < half_index); + return (1 << mbits) + (i & mmask); + } + uint exponent(Index i) const + { + assert(i < half_index); + return ebias - (i >> mbits); + } + + uint16 Ptbl[256]; // kinda clunky, but so is storage management. + + /* Cost in bits of encoding a zero at every probability, scaled by 2^20. + Assumes that index is at most 8 bits wide. */ + + uint32 Ctbl[256]; + + uint32 split(Index i, uint32 R) const // 1 <= split <= max( 1, R-1) + { + if (!ebias) + return 1 + (((R - 1) * Ptbl[i]) >> 16); + + if (i >= half_index) + return R - split(max_index - i, R); + + return 1 + (((R - 1) * mantissa(i)) >> exponent(i)); + } + + uint32 max_range() const + { + return (1 << w) - (r == down_full ? 0 : 1); + } + uint32 min_range() const + { + return (1 << (w - 1)) + (r == down_full ? 1 : 0); + } + uint32 Rinc() const + { + return r == Up ? 1 : 0; + } + + void check_prec() const; + + bool float_init(uint Ebits, uint Mbits); + + void cost_init(); + + bool_coder_spec( + uint prec, Rounding rr, uint Ebits = 0, uint Mbits = 0 + ) + : w(prec), r(rr) + { + float_init(Ebits, Mbits); + } +public: + // Read complete spec from file. + bool_coder_spec(FILE *); + + // Write spec to file. + void dump(FILE *) const; + + // return probability index best approximating prob. 
+ Index operator()(double prob) const; + + // probability corresponding to index + double operator()(Index i) const; + + Index complement(Index i) const + { + return max_index - i; + } + + Index max_index() const + { + return max_index; + } + Index half_index() const + { + return half_index; + } + + uint32 cost_zero(Index i) const + { + return Ctbl[i]; + } + uint32 cost_one(Index i) const + { + return Ctbl[ max_index - i]; + } + uint32 cost_bit(Index i, bool b) const + { + return Ctbl[b? max_index-i:i]; + } +}; + + +/* Pseudo floating-point probability specification. + + At least one of Ebits and Mbits must be nonzero. + + Since all arithmetic is done at 32 bits, Ebits is at most 5. + + Total significant bits in index is Ebits + Mbits + 1. + + Below the halfway point (i.e. when the top significant bit is 0), + the index is (e << Mbits) + m. + + The exponent e is between 0 and (2**Ebits) - 1, + the mantissa m is between 0 and (2**Mbits) - 1. + + Prepending an implicit 1 to the mantissa, the probability is then + + (2**Mbits + m) >> (e - 2**Ebits - 1 - Mbits), + + which has (1/2)**(2**Ebits + 1) as a minimum + and (1/2) * [1 - 2**(Mbits + 1)] as a maximum. + + When the index is above the halfway point, the probability is the + complement of the probability associated to the complement of the index. + + Note that the probability increases with the index and that, because of + the symmetry, we cannot encode probability exactly 1/2; though we + can get as close to 1/2 as we like, provided we have enough Mbits. + + The latter is of course not a problem in practice, one never has + exact probabilities and entropy errors are second order, that is, the + "overcoding" of a zero will be largely compensated for by the + "undercoding" of a one (or vice-versa). 
+ + Compared to arithmetic probability specs (a la VP8), this will do better + at very high and low probabilities and worse at probabilities near 1/2, + as well as facilitating the usage of wider or narrower probability indices. +*/ + +struct bool_coder_spec_float : bool_coder_spec +{ + bool_coder_spec_float( + uint Ebits = 3, uint Mbits = 4, Rounding rr = down_full, uint prec = 12 + ) + : bool_coder_spec(prec, rr, Ebits, Mbits) + { + cost_init(); + } +}; + + +struct bool_coder_spec_explicit_table : bool_coder_spec +{ + bool_coder_spec_explicit_table( + cuint16 probability_table[256] = 0, // default is tbl[i] = i << 8. + Rounding = down_full, + uint precision = 16 + ); +}; + +// Contruct table via multiplicative interpolation between +// p[128] = 1/2 and p[0] = (1/2)^x. +// Since we are working with 16-bit precision, x is at most 16. +// For probabilities to increase with i, we must have x > 1. +// For 0 <= i <= 128, p[i] = (1/2)^{ 1 + [1 - (i/128)]*[x-1] }. +// Finally, p[128+i] = 1 - p[128 - i]. + +struct bool_coder_spec_exponential_table : bool_coder_spec +{ + bool_coder_spec_exponential_table(uint x, Rounding = down_full, uint prec = 16); +}; + + +// Commonalities between writer and reader. 
+ +struct bool_coder : bool_coder_namespace +{ + friend struct bool_writer; + friend struct bool_reader; + friend struct BPsrc; +private: + uint32 Low, Range; + cuint32 min_range; + cuint32 rinc; + c_spec spec; + + void _reset() + { + Low = 0; + Range = spec.max_range(); + } + + bool_coder(c_spec &s) + : min_range(s.min_range()), + rinc(s.Rinc()), + spec(s) + { + _reset(); + } + + uint32 half() const + { + return 1 + ((Range - 1) >> 1); + } +public: + c_spec &Spec() const + { + return spec; + } +}; + + +struct bool_writer : bool_coder +{ + friend struct BPsrc; +private: + uchar *Bstart, *Bend, *B; + int bit_lag; + bool is_toast; + void carry(); + void reset() + { + _reset(); + bit_lag = 32 - spec.w; + is_toast = 0; + } + void raw(bool value, uint32 split); +public: + bool_writer(c_spec &, uchar *Dest, size_t Len); + virtual ~bool_writer(); + + void operator()(Index p, bool v) + { + raw(v, spec.split(p, Range)); + } + + uchar *buf() const + { + return Bstart; + } + size_t bytes_written() const + { + return B - Bstart; + } + + // Call when done with input, flushes internal state. + // DO NOT write any more data after calling this. + + bool_writer &flush(); + + void write_bits(int n, uint val) + { + if (n) + { + uint m = 1 << (n - 1); + + do + { + raw((bool)(val & m), half()); + } + while (m >>= 1); + } + } + +# if 0 + // We are agnostic about storage management. + // By default, overflows throw an assert but user can + // override to provide an expanding buffer using ... + + virtual void overflow(uint Len) const; + + // ... this function copies already-written data into new buffer + // and retains new buffer location. + + void new_buffer(uchar *dest, uint Len); + + // Note that storage management is the user's responsibility. +# endif +}; + + +// This could be adjusted to use a little less lookahead. 
+ +struct bool_reader : bool_coder +{ + friend struct BPsrc; +private: + cuchar *const Bstart; // for debugging + cuchar *B; + cuchar *const Bend; + cuint shf; + uint bct; + bool raw(uint32 split); +public: + bool_reader(c_spec &s, cuchar *src, size_t Len); + + bool operator()(Index p) + { + return raw(spec.split(p, Range)); + } + + uint read_bits(int num_bits) + { + uint v = 0; + + while (--num_bits >= 0) + v += v + (raw(half()) ? 1 : 0); + + return v; + } +}; + +extern "C" { + +#endif /* __cplusplus */ + + + /* C interface */ + + typedef struct bool_coder_spec bool_coder_spec; + typedef struct bool_writer bool_writer; + typedef struct bool_reader bool_reader; + + typedef const bool_coder_spec c_bool_coder_spec; + typedef const bool_writer c_bool_writer; + typedef const bool_reader c_bool_reader; + + + /* Optionally override default precision when constructing coder_specs. + Just pass a zero pointer if you don't care. + Precision is at most 16 bits for table specs, at most 23 otherwise. */ + + struct vp8bc_prec + { + enum vp8bc_rounding r; /* see top header file for def */ + unsigned int prec; /* range precision in bits */ + }; + + typedef const struct vp8bc_prec vp8bc_c_prec; + + /* bool_coder_spec contains mapping of uchars to actual probabilities + (16 bit uints) as well as (usually immaterial) selection of + exact finite-precision algorithm used (for now, the latter can only + be overridden using the C++ interface). + See comments above the corresponding C++ constructors for discussion, + especially of exponential probability table generation. 
*/ + + bool_coder_spec *vp8bc_vp8spec(); // just like vp8 + + bool_coder_spec *vp8bc_literal_spec( + const unsigned short prob_map[256], // 0 is like vp8 w/more precision + vp8bc_c_prec* + ); + + bool_coder_spec *vp8bc_float_spec( + unsigned int exponent_bits, unsigned int mantissa_bits, vp8bc_c_prec* + ); + + bool_coder_spec *vp8bc_exponential_spec(unsigned int min_exp, vp8bc_c_prec *); + + bool_coder_spec *vp8bc_spec_from_file(FILE *); + + + void vp8bc_destroy_spec(c_bool_coder_spec *); + + void vp8bc_spec_to_file(c_bool_coder_spec *, FILE *); + + + /* Nearest index to supplied probability of zero, 0 <= prob <= 1. */ + + vp8bc_index_t vp8bc_index(c_bool_coder_spec *, double prob); + + vp8bc_index_t vp8bc_index_from_counts( + c_bool_coder_spec *p, unsigned int zero_ct, unsigned int one_ct + ); + + /* In case you want to look */ + + double vp8bc_probability(c_bool_coder_spec *, vp8bc_index_t); + + /* Opposite index */ + + vp8bc_index_t vp8bc_complement(c_bool_coder_spec *, vp8bc_index_t); + + /* Cost in bits of encoding a zero at given probability, scaled by 2^20. + (assumes that an int holds at least 32 bits). */ + + unsigned int vp8bc_cost_zero(c_bool_coder_spec *, vp8bc_index_t); + + unsigned int vp8bc_cost_one(c_bool_coder_spec *, vp8bc_index_t); + unsigned int vp8bc_cost_bit(c_bool_coder_spec *, vp8bc_index_t, int); + + + /* bool_writer interface */ + + /* Length = 0 disables checking for writes beyond buffer end. */ + + bool_writer *vp8bc_create_writer( + c_bool_coder_spec *, unsigned char *Destination, size_t Length + ); + + /* Flushes out any buffered data and returns total # of bytes written. 
*/ + + size_t vp8bc_destroy_writer(bool_writer *); + + void vp8bc_write_bool(bool_writer *, int boolean_val, vp8bc_index_t false_prob); + + void vp8bc_write_bits( + bool_writer *, unsigned int integer_value, int number_of_bits + ); + + c_bool_coder_spec *vp8bc_writer_spec(c_bool_writer *); + + + /* bool_reader interface */ + + /* Length = 0 disables checking for reads beyond buffer end. */ + + bool_reader *vp8bc_create_reader( + c_bool_coder_spec *, const unsigned char *Source, size_t Length + ); + void vp8bc_destroy_reader(bool_reader *); + + int vp8bc_read_bool(bool_reader *, vp8bc_index_t false_prob); + + unsigned int vp8bc_read_bits(bool_reader *, int number_of_bits); + + c_bool_coder_spec *vp8bc_reader_spec(c_bool_reader *); + +#if __cplusplus +} +#endif + +#endif /* bool_coder_h */ diff --git a/vp8/common/codec_common_interface.h b/vp8/common/codec_common_interface.h new file mode 100644 index 000000000..7881b0a41 --- /dev/null +++ b/vp8/common/codec_common_interface.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + +#ifndef CODEC_COMMON_INTERFACE_H +#define CODEC_COMMON_INTERFACE_H + +#define __export +#define _export +#define dll_export __declspec( dllexport ) +#define dll_import __declspec( dllimport ) + +// Playback ERROR Codes. 
+#define NO_DECODER_ERROR 0 +#define REMOTE_DECODER_ERROR -1 + +#define DFR_BAD_DCT_COEFF -100 +#define DFR_ZERO_LENGTH_FRAME -101 +#define DFR_FRAME_SIZE_INVALID -102 +#define DFR_OUTPUT_BUFFER_OVERFLOW -103 +#define DFR_INVALID_FRAME_HEADER -104 +#define FR_INVALID_MODE_TOKEN -110 +#define ETR_ALLOCATION_ERROR -200 +#define ETR_INVALID_ROOT_PTR -201 +#define SYNCH_ERROR -400 +#define BUFFER_UNDERFLOW_ERROR -500 +#define PB_IB_OVERFLOW_ERROR -501 + +// External error triggers +#define PB_HEADER_CHECKSUM_ERROR -601 +#define PB_DATA_CHECKSUM_ERROR -602 + +// DCT Error Codes +#define DDCT_EXPANSION_ERROR -700 +#define DDCT_INVALID_TOKEN_ERROR -701 + +// exception_errors +#define GEN_EXCEPTIONS -800 +#define EX_UNQUAL_ERROR -801 + +// Unrecoverable error codes +#define FATAL_PLAYBACK_ERROR -1000 +#define GEN_ERROR_CREATING_CDC -1001 +#define GEN_THREAD_CREATION_ERROR -1002 +#define DFR_CREATE_BMP_FAILED -1003 + +// YUV buffer configuration structure +typedef struct +{ + int y_width; + int y_height; + int y_stride; + + int uv_width; + int uv_height; + int uv_stride; + + unsigned char *y_buffer; + unsigned char *u_buffer; + unsigned char *v_buffer; + +} YUV_BUFFER_CONFIG; +typedef enum +{ + C_SET_KEY_FRAME, + C_SET_FIXED_Q, + C_SET_FIRSTPASS_FILE, + C_SET_EXPERIMENTAL_MIN, + C_SET_EXPERIMENTAL_MAX = C_SET_EXPERIMENTAL_MIN + 255, + C_SET_CHECKPROTECT, + C_SET_TESTMODE, + C_SET_INTERNAL_SIZE, + C_SET_RECOVERY_FRAME, + C_SET_REFERENCEFRAME, + C_SET_GOLDENFRAME + +#ifndef VP50_COMP_INTERFACE + // Specialist test facilities. +// C_VCAP_PARAMS, // DO NOT USE FOR NOW WITH VFW CODEC +#endif + +} C_SETTING; + +typedef unsigned long C_SET_VALUE; + + +#endif diff --git a/vp8/common/coefupdateprobs.h b/vp8/common/coefupdateprobs.h new file mode 100644 index 000000000..99affd618 --- /dev/null +++ b/vp8/common/coefupdateprobs.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/* Update probabilities for the nodes in the token entropy tree. + Generated file included by entropy.c */ + +const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 
255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 
255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; diff --git a/vp8/common/common.h b/vp8/common/common.h new file mode 100644 index 000000000..29f6d371b --- /dev/null +++ b/vp8/common/common.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef common_h +#define common_h 1 + +#include <assert.h> + +/* Interface header for common constant data structures and lookup tables */ + +#include "vpx_mem/vpx_mem.h" + +#include "common_types.h" + +/* Only need this for fixed-size arrays, for structs just assign. */ + +#define vp8_copy( Dest, Src) { \ + assert( sizeof( Dest) == sizeof( Src)); \ + vpx_memcpy( Dest, Src, sizeof( Src)); \ + } + +/* Use this for variably-sized arrays. 
*/ + +#define vp8_copy_array( Dest, Src, N) { \ + assert( sizeof( *Dest) == sizeof( *Src)); \ + vpx_memcpy( Dest, Src, N * sizeof( *Src)); \ + } + +#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest)); + +#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest)); + + +#endif /* common_h */ diff --git a/vp8/common/common_types.h b/vp8/common/common_types.h new file mode 100644 index 000000000..deb5ed8e5 --- /dev/null +++ b/vp8/common/common_types.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_COMMON_TYPES +#define __INC_COMMON_TYPES + +#define TRUE 1 +#define FALSE 0 + +#endif diff --git a/vp8/common/context.c b/vp8/common/context.c new file mode 100644 index 000000000..17ee8c338 --- /dev/null +++ b/vp8/common/context.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "entropy.h" + +/* *** GENERATED FILE: DO NOT EDIT *** */ + +#if 0 +int Contexts[vp8_coef_counter_dimen]; + +const int default_contexts[vp8_coef_counter_dimen] = +{ + { + // Block Type ( 0 ) + { + // Coeff Band ( 0 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + { + // Coeff Band ( 1 ) + {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,}, + {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,}, + {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,}, + }, + { + // Coeff Band ( 2 ) + {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,}, + {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,}, + {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,}, + }, + { + // Coeff Band ( 3 ) + {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,}, + {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,}, + { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,}, + }, + { + // Coeff Band ( 4 ) + {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,}, + {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,}, + { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,}, + }, + { + // Coeff Band ( 5 ) + {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,}, + {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,}, + { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,}, + }, + { + // Coeff Band ( 6 ) + {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,}, + {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,}, + { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,}, + }, + { + // Coeff Band ( 7 ) + { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,}, + }, + }, + { + // Block Type ( 1 ) + { + // Coeff Band ( 0 ) + {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,}, + {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,}, + {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,}, + }, + { + // Coeff Band ( 1 ) + {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 
0,}, + {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,}, + {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,}, + }, + { + // Coeff Band ( 2 ) + {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,}, + {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,}, + {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,}, + }, + { + // Coeff Band ( 3 ) + {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,}, + {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,}, + {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,}, + }, + { + // Coeff Band ( 4 ) + {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,}, + {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,}, + { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,}, + }, + { + // Coeff Band ( 5 ) + {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,}, + {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,}, + { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,}, + }, + { + // Coeff Band ( 6 ) + {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,}, + {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,}, + { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,}, + }, + { + // Coeff Band ( 7 ) + { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,}, + { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + }, + { + // Block Type ( 2 ) + { + // Coeff Band ( 0 ) + { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,}, + {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,}, + {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,}, + }, + { + // Coeff Band ( 1 ) + {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,}, + {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,}, + {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,}, + }, + { + // Coeff Band ( 2 ) + { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,}, + { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,}, + { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,}, + }, + { + // Coeff Band ( 3 ) + { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,}, 
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,}, + }, + { + // Coeff Band ( 4 ) + { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,}, + { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,}, + }, + { + // Coeff Band ( 5 ) + { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,}, + { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + { + // Coeff Band ( 6 ) + { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,}, + { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,}, + }, + { + // Coeff Band ( 7 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + }, + { + // Block Type ( 3 ) + { + // Coeff Band ( 0 ) + {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,}, + {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,}, + {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,}, + }, + { + // Coeff Band ( 1 ) + {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,}, + {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,}, + {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,}, + }, + { + // Coeff Band ( 2 ) + {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,}, + {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,}, + {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,}, + }, + { + // Coeff Band ( 3 ) + {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,}, + {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,}, + {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,}, + }, + { + // Coeff Band ( 4 ) + {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,}, + {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,}, + {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,}, + }, + { + // Coeff Band ( 5 ) + {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,}, + {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,}, + {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,}, + }, + { + // Coeff Band ( 6 ) + 
{27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,}, + {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,}, + {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,}, + }, + { + // Coeff Band ( 7 ) + { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,}, + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,}, + }, + }, +}; + +//Update probabilities for the nodes in the token entropy tree. +const vp8_prob tree_update_probs[vp8_coef_tree_dimen] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 
255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 
255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; +#endif diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c new file mode 100644 index 000000000..e2d2d2c0f --- /dev/null +++ b/vp8/common/debugmodes.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <stdio.h> +#include "blockd.h" + + +void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame) +{ + + int mb_row; + int mb_col; + int mb_index = 0; + FILE *mvs = fopen("mvs.stt", "a"); + + // print out the macroblock Y modes + mb_index = 0; + fprintf(mvs, "Mb Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + mb_index = 0; + fprintf(mvs, "Mb mv ref for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + // print out the macroblock UV modes + mb_index = 0; + fprintf(mvs, "UV Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + // print out the block modes + mb_index = 0; + fprintf(mvs, "Mbs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; b_row++) + { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; b_col++) + { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + + if (mi[mb_index].mbmi.mode == B_PRED) + fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].mode); + else + fprintf(mvs, "xx "); + + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + // print out the macroblock mvs + mb_index = 0; + fprintf(mvs, "MVs for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row 
/ 2, mi[mb_index].mbmi.mv.as_mv.col / 2); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + + // print out the block modes + mb_index = 0; + fprintf(mvs, "MVs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; b_row++) + { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; b_col++) + { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col); + + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + + fclose(mvs); +} diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h new file mode 100644 index 000000000..ccdf326e6 --- /dev/null +++ b/vp8/common/defaultcoefcounts.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/* Generated file, included by entropy.c */ + +static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] = +{ + + { + // Block Type ( 0 ) + { + // Coeff Band ( 0 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + { + // Coeff Band ( 1 ) + {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,}, + {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,}, + {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,}, + }, + { + // Coeff Band ( 2 ) + {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,}, + {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,}, + {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,}, + }, + { + // Coeff Band ( 3 ) + {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,}, + {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,}, + { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,}, + }, + { + // Coeff Band ( 4 ) + {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,}, + {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,}, + { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,}, + }, + { + // Coeff Band ( 5 ) + {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,}, + {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,}, + { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,}, + }, + { + // Coeff Band ( 6 ) + {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,}, + {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,}, + { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,}, + }, + { + // Coeff Band ( 7 ) + { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,}, + }, + }, + { + // Block Type ( 1 ) + { + // Coeff Band ( 0 ) + {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,}, + {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,}, + {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,}, + }, + { + // Coeff Band ( 1 ) + {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,}, + 
{11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,}, + {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,}, + }, + { + // Coeff Band ( 2 ) + {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,}, + {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,}, + {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,}, + }, + { + // Coeff Band ( 3 ) + {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,}, + {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,}, + {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,}, + }, + { + // Coeff Band ( 4 ) + {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,}, + {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,}, + { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,}, + }, + { + // Coeff Band ( 5 ) + {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,}, + {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,}, + { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,}, + }, + { + // Coeff Band ( 6 ) + {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,}, + {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,}, + { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,}, + }, + { + // Coeff Band ( 7 ) + { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,}, + { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + }, + { + // Block Type ( 2 ) + { + // Coeff Band ( 0 ) + { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,}, + {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,}, + {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,}, + }, + { + // Coeff Band ( 1 ) + {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,}, + {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,}, + {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,}, + }, + { + // Coeff Band ( 2 ) + { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,}, + { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,}, + { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,}, + }, + { + // Coeff Band ( 3 ) + { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,}, + { 
119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,}, + }, + { + // Coeff Band ( 4 ) + { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,}, + { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,}, + }, + { + // Coeff Band ( 5 ) + { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,}, + { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + { + // Coeff Band ( 6 ) + { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,}, + { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,}, + }, + { + // Coeff Band ( 7 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + }, + { + // Block Type ( 3 ) + { + // Coeff Band ( 0 ) + {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,}, + {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,}, + {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,}, + }, + { + // Coeff Band ( 1 ) + {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,}, + {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,}, + {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,}, + }, + { + // Coeff Band ( 2 ) + {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,}, + {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,}, + {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,}, + }, + { + // Coeff Band ( 3 ) + {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,}, + {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,}, + {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,}, + }, + { + // Coeff Band ( 4 ) + {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,}, + {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,}, + {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,}, + }, + { + // Coeff Band ( 5 ) + {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,}, + {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,}, + {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,}, + }, + { + // Coeff Band ( 6 ) + 
{27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,}, + {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,}, + {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,}, + }, + { + // Coeff Band ( 7 ) + { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,}, + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,}, + }, + }, +}; diff --git a/vp8/common/dma_desc.h b/vp8/common/dma_desc.h new file mode 100644 index 000000000..5e6fa0ca9 --- /dev/null +++ b/vp8/common/dma_desc.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef _dma_desc_h +#define _dma_desc_h + +#if defined(__cplusplus) +extern "C" { +#endif + + +#define NDSIZE_LG 0x00000900 // Next Descriptor Size +#define NDSIZE_SM 0x00000800 // Next Descriptor Size +#define NDSIZE_7 0x00000700 // Next Descriptor Size +#define NDSIZE_6 0x00000600 // Next Descriptor Size +#define NDSIZE_5 0x00000500 // Next Descriptor Size +#define NDSIZE_4 0x00000400 // Next Descriptor Size +#define NDSIZE_3 0x00000300 // Next Descriptor Size +#define NDSIZE_2 0x00000200 // Next Descriptor Size +#define NDSIZE_1 0x00000100 // Next Descriptor Size + +#define FLOW_STOP 0x0000 +#define FLOW_AUTO 0x1000 +#define FLOW_DESC_AR 0x4000 +#define FLOW_DESC_SM 0x6000 +#define FLOW_DESC_LG 0x7000 + + typedef struct + { + unsigned int ndp; + //unsigned short ndpl; + //unsigned short ndph; + unsigned int sa; + //unsigned short sal; + //unsigned short sah; + + unsigned short dmacfg; + unsigned short xcnt; + unsigned short xmod; + unsigned short ycnt; + unsigned short ymod; + + } LARGE_DESC; + + typedef struct + { + unsigned short ndpl; + unsigned short sal; + unsigned short sah; + unsigned short dmacfg; + unsigned 
short xcnt; + unsigned short xmod; + unsigned short ycnt; + unsigned short ymod; + } SMALL_DESC; + + typedef struct + { + unsigned short sal; + unsigned short sah; + unsigned short dmacfg; + unsigned short xcnt; + unsigned short xmod; + unsigned short ycnt; + unsigned short ymod; + } ARRAY_DESC_7; + + typedef struct + { + unsigned short sal; + unsigned short sah; + unsigned short dmacfg; + unsigned short xcnt; + unsigned short xmod; + unsigned short ycnt; + } ARRAY_DESC_6; + + typedef struct + { + unsigned short sal; + unsigned short sah; + unsigned short dmacfg; + unsigned short xcnt; + unsigned short xmod; + } ARRAY_DESC_5; + + typedef struct + { + unsigned short sal; + unsigned short sah; + unsigned short dmacfg; + unsigned short xcnt; + } ARRAY_DESC_4; + + typedef struct + { + unsigned short sal; + unsigned short sah; + unsigned short dmacfg; + } ARRAY_DESC_3; + + typedef struct + { + unsigned short sal; + unsigned short sah; + } ARRAY_DESC_2; + + typedef struct + { + unsigned short sal; + } ARRAY_DESC_1; + +#if defined(__cplusplus) +} +#endif + +#endif //_dma_desc_h diff --git a/vp8/common/duck_io.h b/vp8/common/duck_io.h new file mode 100644 index 000000000..f63a5cdc1 --- /dev/null +++ b/vp8/common/duck_io.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef _duck_io_h +#define _duck_io_h + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined (_WIN32) + typedef __int64 int64_t; +#elif defined(__MWERKS__) + typedef long long int64_t; +#elif defined(__APPLE__) || defined(__POWERPC) +#include <ppc/types.h> +#else + typedef long long int64_t; +#endif + + typedef struct + { + int64_t offset; // offset to start from + int blocking; // non-zero for blocking + } re_open_t; + + + typedef enum + { + SAL_ERR_MAX = -10, + SAL_ERROR = -11, // Default error + SAL_ERR_WSASTARTUP = -12, + SAL_ERR_SOCKET_CREATE = -13, + SAL_ERR_RESOLVING_HOSTNAME = -14, + SAL_ERR_SERVER_CONNECTION = -15, + SAL_ERR_SENDING_DATA = -16, + SAL_ERR_RECEIVING_DATA = -17, + SAL_ERR_404_FILE_NOT_FOUND = -18, + SAL_ERR_PARSING_HTTP_HEADER = -19, + SAL_ERR_PARSING_CONTENT_LEN = -20, + SAL_ERR_CONNECTION_TIMEOUT = -21, + SAL_ERR_FILE_OPEN_FAILED = -22, + SAL_ERR_MIN = -23 + } SAL_ERR; /* EMH 1-15-03 */ + + + typedef struct sal_err_map_temp + { + SAL_ERR code; + const char *decode; + + } sal_err_map_t; + + + static char *sal_err_text(SAL_ERR e) + { + int t; + const sal_err_map_t g_sal_err_map[] = + { + { SAL_ERR_WSASTARTUP, "Error with WSAStartup" }, + { SAL_ERR_SOCKET_CREATE, "Error creating socket" }, + { SAL_ERR_RESOLVING_HOSTNAME, "Error resolving hostname" }, + { SAL_ERR_SERVER_CONNECTION, "Error connecting to server" }, + { SAL_ERR_SENDING_DATA, "Error sending data" }, + { SAL_ERR_RECEIVING_DATA, "Error receiving data" }, + { SAL_ERR_404_FILE_NOT_FOUND, "Error file not found " }, + { SAL_ERR_PARSING_HTTP_HEADER, "Error parsing http header" }, + { SAL_ERR_PARSING_CONTENT_LEN, "Error parsing content length" }, + { SAL_ERR_CONNECTION_TIMEOUT, "Error Connection timed out" }, + { SAL_ERR_FILE_OPEN_FAILED, "Error opening file" } + }; + + for (t = 0; t < sizeof(g_sal_err_map) / sizeof(sal_err_map_t); t++) + { + if (e == g_sal_err_map[t].code) + return (char *) g_sal_err_map[t].decode; + } + + return 0; + } + + + + + + + + int 
duck_open(const char *fname, unsigned long user_data); + + void duck_close(int ghndl); + + int duck_read(int ghndl, unsigned char *buf, int nbytes); + + int64_t duck_seek(int g_hndl, int64_t offs, int origin); + + int duck_read_finished(int han, int flag); /* FWG 7-9-99 */ + + int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */ + + int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */ + + int64_t duck_available_data(int handle); /* EMH 10-23-03 */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c new file mode 100644 index 000000000..e524c2acc --- /dev/null +++ b/vp8/common/entropy.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <stdio.h> + +#include "entropy.h" +#include "string.h" +#include "blockd.h" +#include "onyxc_int.h" + +#define uchar unsigned char /* typedefs can clash */ +#define uint unsigned int + +typedef const uchar cuchar; +typedef const uint cuint; + +typedef vp8_prob Prob; + +#include "coefupdateprobs.h" + +DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; +DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}; +DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = +{ + 0, 1, 4, 8, + 5, 2, 3, 6, + 9, 12, 13, 10, + 7, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]); + +const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6}; + +/* Array indices are identical to previously-existing CONTEXT_NODE indices */ + +const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ +{ + -DCT_EOB_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ + -ONE_TOKEN, 6, /* 2 = ONE */ + 8, 12, /* 3 = LOW_VAL */ + -TWO_TOKEN, 10, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 14, 16, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 18, 20, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; + +struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens]; + +/* Trees for extra bits. 
Probabilities are constant and + do not depend on previously encoded bits */ + +static const Prob Pcat1[] = { 159}; +static const Prob Pcat2[] = { 165, 145}; +static const Prob Pcat3[] = { 173, 148, 140}; +static const Prob Pcat4[] = { 176, 155, 140, 135}; +static const Prob Pcat5[] = { 180, 157, 141, 134, 130}; +static const Prob Pcat6[] = +{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129}; + +static vp8_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[22]; + +void vp8_init_scan_order_mask() +{ + int i; + + for (i = 0; i < 16; i++) + { + vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i; + } + +} + +static void init_bit_tree(vp8_tree_index *p, int n) +{ + int i = 0; + + while (++i < n) + { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; +} + +static void init_bit_trees() +{ + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 11); +} + + +static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11]; + +vp8_extra_bit_struct vp8_extra_bits[12] = +{ + { 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 1}, + { 0, 0, 0, 0, 2}, + { 0, 0, 0, 0, 3}, + { 0, 0, 0, 0, 4}, + { cat1, Pcat1, bcc1, 1, 5}, + { cat2, Pcat2, bcc2, 2, 7}, + { cat3, Pcat3, bcc3, 3, 11}, + { cat4, Pcat4, bcc4, 4, 19}, + { cat5, Pcat5, bcc5, 5, 35}, + { cat6, Pcat6, bcc6, 11, 67}, + { 0, 0, 0, 0, 0} +}; +#include "defaultcoefcounts.h" + +void vp8_default_coef_probs(VP8_COMMON *pc) +{ + int h = 0; + + do + { + int i = 0; + + do + { + int k = 0; + + do + { + unsigned int branch_ct [vp8_coef_tokens-1] [2]; + vp8_tree_probs_from_distribution( + vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree, + pc->fc.coef_probs [h][i][k], branch_ct, default_coef_counts [h][i][k], + 256, 1); + + } + while (++k < PREV_COEF_CONTEXTS); + } + while (++i < COEF_BANDS); + } + while (++h < BLOCK_TYPES); +} + + +void vp8_coef_tree_initialize() +{ + init_bit_trees(); + 
vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree); +} diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h new file mode 100644 index 000000000..1415832d5 --- /dev/null +++ b/vp8/common/entropy.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_ENTROPY_H +#define __INC_ENTROPY_H + +#include "treecoder.h" +#include "blockd.h" + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 //0 Extra Bits 0+0 +#define ONE_TOKEN 1 //1 Extra Bits 0+1 +#define TWO_TOKEN 2 //2 Extra Bits 0+1 +#define THREE_TOKEN 3 //3 Extra Bits 0+1 +#define FOUR_TOKEN 4 //4 Extra Bits 0+1 +#define DCT_VAL_CATEGORY1 5 //5-6 Extra Bits 1+1 +#define DCT_VAL_CATEGORY2 6 //7-10 Extra Bits 2+1 +#define DCT_VAL_CATEGORY3 7 //11-26 Extra Bits 4+1 +#define DCT_VAL_CATEGORY4 8 //11-26 Extra Bits 5+1 +#define DCT_VAL_CATEGORY5 9 //27-58 Extra Bits 5+1 +#define DCT_VAL_CATEGORY6 10 //59+ Extra Bits 11+1 +#define DCT_EOB_TOKEN 11 //EOB Extra Bits 0+0 + +#define vp8_coef_tokens 12 +#define MAX_ENTROPY_TOKENS vp8_coef_tokens +#define ENTROPY_NODES 11 + +extern const vp8_tree_index vp8_coef_tree[]; + +extern struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens]; + +typedef struct +{ + vp8_tree_p tree; + const vp8_prob *prob; + vp8bc_index_t *prob_bc; + int Len; + int base_val; +} vp8_extra_bit_struct; + +extern vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 2048 + + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 
0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ + +#define BLOCK_TYPES 4 + +/* Middle dimension is a coarsening of the coefficient's + position within the 4x4 DCT. */ + +#define COEF_BANDS 8 +extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]); + +/* Inside dimension is 3-valued measure of nearby complexity, that is, + the extent to which nearby coefficients are nonzero. For the first + coefficient (DC, unless block type is 0), we look at the (already encoded) + blocks above and to the left of the current block. The context index is + then the number (0,1,or 2) of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is roughly the size of the + most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). 
*/ + +/*# define DC_TOKEN_CONTEXTS 3 // 00, 0!0, !0!0 */ +# define PREV_COEF_CONTEXTS 3 + +extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]); + +extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1]; + + +struct VP8Common; +void vp8_default_coef_probs(struct VP8Common *); + +extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); +extern short vp8_default_zig_zag_mask[16]; +extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; + +void vp8_coef_tree_initialize(void); +#endif diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c new file mode 100644 index 000000000..7dc1acde0 --- /dev/null +++ b/vp8/common/entropymode.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "entropymode.h" +#include "entropy.h" +#include "vpx_mem/vpx_mem.h" + +static const unsigned int kf_y_mode_cts[VP8_YMODES] = { 1607, 915, 812, 811, 5455}; +static const unsigned int y_mode_cts [VP8_YMODES] = { 8080, 1908, 1582, 1007, 5874}; + +static const unsigned int uv_mode_cts [VP8_UV_MODES] = { 59483, 13605, 16492, 4230}; +static const unsigned int kf_uv_mode_cts[VP8_UV_MODES] = { 5319, 1904, 1703, 674}; + +static const unsigned int bmode_cts[VP8_BINTRAMODES] = +{ + 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723 +}; + +typedef enum +{ + SUBMVREF_NORMAL, + SUBMVREF_LEFT_ZED, + SUBMVREF_ABOVE_ZED, + SUBMVREF_LEFT_ABOVE_SAME, + SUBMVREF_LEFT_ABOVE_ZED +} sumvfref_t; + +int vp8_mv_cont(const MV *l, const MV *a) +{ + int lez = (l->row == 0 && l->col == 0); + int aez = (a->row == 0 && a->col == 0); + int lea = (l->row == a->row && l->col == a->col); + + if (lea && lez) + return SUBMVREF_LEFT_ABOVE_ZED; + + if (lea) + return SUBMVREF_LEFT_ABOVE_SAME; + + if (aez) + return SUBMVREF_ABOVE_ZED; + + if (lez) + return SUBMVREF_LEFT_ZED; + + return SUBMVREF_NORMAL; +} + +static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25}; + +const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] = +{ + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1 , 34 }, + { 208, 1 , 1 } +}; + + + +vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] = +{ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, + 1, 1, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + }, + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + }, +}; + +const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16}; + +const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150}; + + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. 
*/ + +const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */ +{ + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ +}; + +/* Again, these trees use the same probability indices as their + explicitly-programmed predecessors. */ + +const vp8_tree_index vp8_ymode_tree[8] = +{ + -DC_PRED, 2, + 4, 6, + -V_PRED, -H_PRED, + -TM_PRED, -B_PRED +}; + +const vp8_tree_index vp8_kf_ymode_tree[8] = +{ + -B_PRED, 2, + 4, 6, + -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_uv_mode_tree[6] = +{ + -DC_PRED, 2, + -V_PRED, 4, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_mbsplit_tree[6] = +{ + -3, 2, + -2, 4, + -0, -1 +}; + +const vp8_tree_index vp8_mv_ref_tree[8] = +{ + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, 6, + -NEWMV, -SPLITMV +}; + +const vp8_tree_index vp8_sub_mv_ref_tree[6] = +{ + -LEFT4X4, 2, + -ABOVE4X4, 4, + -ZERO4X4, -NEW4X4 +}; + + +struct vp8_token_struct vp8_bmode_encodings [VP8_BINTRAMODES]; +struct vp8_token_struct vp8_ymode_encodings [VP8_YMODES]; +struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES]; +struct vp8_token_struct vp8_uv_mode_encodings [VP8_UV_MODES]; +struct vp8_token_struct vp8_mbsplit_encodings [VP8_NUMMBSPLITS]; + +struct vp8_token_struct vp8_mv_ref_encoding_array [VP8_MVREFS]; +struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS]; + + +const vp8_tree_index vp8_small_mvtree [14] = +{ + 2, 8, + 4, 6, + -0, -1, + -2, -3, + 10, 12, + -4, -5, + -6, -7 +}; + +struct vp8_token_struct vp8_small_mvencodings [8]; + +void vp8_init_mbmode_probs(VP8_COMMON *x) +{ + unsigned int bct [VP8_YMODES] [2]; /* num Ymodes > num UV modes */ + + vp8_tree_probs_from_distribution( + VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree, 
+ x->fc.ymode_prob, bct, y_mode_cts, + 256, 1 + ); + vp8_tree_probs_from_distribution( + VP8_YMODES, vp8_kf_ymode_encodings, vp8_kf_ymode_tree, + x->kf_ymode_prob, bct, kf_y_mode_cts, + 256, 1 + ); + vp8_tree_probs_from_distribution( + VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree, + x->fc.uv_mode_prob, bct, uv_mode_cts, + 256, 1 + ); + vp8_tree_probs_from_distribution( + VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree, + x->kf_uv_mode_prob, bct, kf_uv_mode_cts, + 256, 1 + ); + vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); +} + + +static void intra_bmode_probs_from_distribution( + vp8_prob p [VP8_BINTRAMODES-1], + unsigned int branch_ct [VP8_BINTRAMODES-1] [2], + const unsigned int events [VP8_BINTRAMODES] +) +{ + vp8_tree_probs_from_distribution( + VP8_BINTRAMODES, vp8_bmode_encodings, vp8_bmode_tree, + p, branch_ct, events, + 256, 1 + ); +} + +void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1]) +{ + unsigned int branch_ct [VP8_BINTRAMODES-1] [2]; + intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts); +} + +void vp8_kf_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]) +{ + unsigned int branch_ct [VP8_BINTRAMODES-1] [2]; + + int i = 0; + + do + { + int j = 0; + + do + { + intra_bmode_probs_from_distribution( + p[i][j], branch_ct, vp8_kf_default_bmode_counts[i][j]); + + } + while (++j < VP8_BINTRAMODES); + } + while (++i < VP8_BINTRAMODES); +} + + +void vp8_entropy_mode_init() +{ + vp8_tokens_from_tree(vp8_bmode_encodings, vp8_bmode_tree); + vp8_tokens_from_tree(vp8_ymode_encodings, vp8_ymode_tree); + vp8_tokens_from_tree(vp8_kf_ymode_encodings, vp8_kf_ymode_tree); + vp8_tokens_from_tree(vp8_uv_mode_encodings, vp8_uv_mode_tree); + vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree); + + vp8_tokens_from_tree(VP8_MVREFENCODINGS, vp8_mv_ref_tree); + vp8_tokens_from_tree(VP8_SUBMVREFENCODINGS, vp8_sub_mv_ref_tree); + + 
vp8_tokens_from_tree(vp8_small_mvencodings, vp8_small_mvtree); +} diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h new file mode 100644 index 000000000..ff630a477 --- /dev/null +++ b/vp8/common/entropymode.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_ENTROPYMODE_H +#define __INC_ENTROPYMODE_H + +#include "onyxc_int.h" +#include "treecoder.h" + +typedef const int vp8_mbsplit[16]; + +#define VP8_NUMMBSPLITS 4 + +extern vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS]; + +extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */ + +extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1]; + +extern int vp8_mv_cont(const MV *l, const MV *a); +#define SUBMVREF_COUNT 5 +extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1]; + + +extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES]; + + +extern const vp8_tree_index vp8_bmode_tree[]; + +extern const vp8_tree_index vp8_ymode_tree[]; +extern const vp8_tree_index vp8_kf_ymode_tree[]; +extern const vp8_tree_index vp8_uv_mode_tree[]; + +extern const vp8_tree_index vp8_mbsplit_tree[]; +extern const vp8_tree_index vp8_mv_ref_tree[]; +extern const vp8_tree_index vp8_sub_mv_ref_tree[]; + +extern struct vp8_token_struct vp8_bmode_encodings [VP8_BINTRAMODES]; +extern struct vp8_token_struct vp8_ymode_encodings [VP8_YMODES]; +extern struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES]; +extern struct vp8_token_struct vp8_uv_mode_encodings [VP8_UV_MODES]; +extern struct vp8_token_struct vp8_mbsplit_encodings [VP8_NUMMBSPLITS]; + +/* Inter mode values do not start at zero */ + +extern struct 
vp8_token_struct vp8_mv_ref_encoding_array [VP8_MVREFS]; +extern struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS]; + +#define VP8_MVREFENCODINGS (vp8_mv_ref_encoding_array - NEARESTMV) +#define VP8_SUBMVREFENCODINGS (vp8_sub_mv_ref_encoding_array - LEFT4X4) + + +extern const vp8_tree_index vp8_small_mvtree[]; + +extern struct vp8_token_struct vp8_small_mvencodings [8]; + +void vp8_entropy_mode_init(void); + +void vp8_init_mbmode_probs(VP8_COMMON *x); + +void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]); +void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]); + +#endif diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c new file mode 100644 index 000000000..2b00c17a9 --- /dev/null +++ b/vp8/common/entropymv.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "entropymv.h" + +const MV_CONTEXT vp8_mv_update_probs[2] = +{ + {{ + 237, + 246, + 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 + }}, + {{ + 231, + 243, + 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 + }} +}; +const MV_CONTEXT vp8_default_mv_context[2] = +{ + {{ + // row + 162, // is short + 128, // sign + 225, 146, 172, 147, 214, 39, 156, // short tree + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 // long bits + }}, + + + + {{ + // same for column + 164, // is short + 128, + 204, 170, 119, 235, 140, 230, 228, + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 // long bits + + }} +}; diff --git a/vp8/common/entropymv.h b/vp8/common/entropymv.h new file mode 100644 index 000000000..d940c599b --- /dev/null +++ b/vp8/common/entropymv.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef __INC_ENTROPYMV_H +#define __INC_ENTROPYMV_H + +#include "treecoder.h" + +enum +{ + mv_max = 1023, /* max absolute value of a MV component */ + MVvals = (2 * mv_max) + 1, /* # possible values "" */ + + mvlong_width = 10, /* Large MVs have 9 bit magnitudes */ + mvnum_short = 8, /* magnitudes 0 through 7 */ + + /* probability offsets for coding each MV component */ + + mvpis_short = 0, /* short (<= 7) vs long (>= 8) */ + MVPsign, /* sign for non-zero */ + MVPshort, /* 8 short values = 7-position tree */ + + MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */ + MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */ +}; + +typedef struct mv_context +{ + vp8_prob prob[MVPcount]; /* often come in row, col pairs */ +} MV_CONTEXT; + +extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; + +#endif diff --git a/vp8/common/extend.c b/vp8/common/extend.c new file mode 100644 index 000000000..74079527c --- /dev/null +++ b/vp8/common/extend.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "extend.h" +#include "vpx_mem/vpx_mem.h" + + +static void extend_plane_borders +( + unsigned char *s, // source + int sp, // pitch + int h, // height + int w, // width + int et, // extend top border + int el, // extend left border + int eb, // extend bottom border + int er // extend right border +) +{ + + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + int linesize; + + // copy the left and right most columns out + src_ptr1 = s; + src_ptr2 = s + w - 1; + dest_ptr1 = s - el; + dest_ptr2 = s + w; + + for (i = 0; i < h - 0 + 1; i++) + { + vpx_memset(dest_ptr1, src_ptr1[0], el); + vpx_memset(dest_ptr2, src_ptr2[0], er); + src_ptr1 += sp; + src_ptr2 += sp; + dest_ptr1 += sp; + dest_ptr2 += sp; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = s - el; + src_ptr2 = s + sp * (h - 1) - el; + dest_ptr1 = s + sp * (-et) - el; + dest_ptr2 = s + sp * (h) - el; + linesize = el + er + w + 1; + + for (i = 0; i < (int)et; i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, linesize); + dest_ptr1 += sp; + } + + for (i = 0; i < (int)eb; i++) + { + vpx_memcpy(dest_ptr2, src_ptr2, linesize); + dest_ptr2 += sp; + } +} + + +void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height) +{ + int er = 0xf & (16 - (width & 0xf)); + int eb = 0xf & (16 - (height & 0xf)); + + // check for non multiples of 16 + if (er != 0 || eb != 0) + { + extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er); + + //adjust for uv + height = (height + 1) >> 1; + width = (width + 1) >> 1; + er = 0x7 & (8 - (width & 0x7)); + eb = 0x7 & (8 - (height & 0x7)); + + if (er || eb) + { + extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er); + extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er); + } + } +} + +// note the extension is only for the last row, for intra prediction purpose +void 
vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr) +{ + int i; + + YPtr += ybf->y_stride * 14; + UPtr += ybf->uv_stride * 6; + VPtr += ybf->uv_stride * 6; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } + + YPtr += ybf->y_stride; + UPtr += ybf->uv_stride; + VPtr += ybf->uv_stride; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } +} diff --git a/vp8/common/extend.h b/vp8/common/extend.h new file mode 100644 index 000000000..6809ae756 --- /dev/null +++ b/vp8/common/extend.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_EXTEND_H +#define __INC_EXTEND_H + +#include "vpx_scale/yv12config.h" + +void Extend(YV12_BUFFER_CONFIG *ybf); +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); +void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height); + +#endif diff --git a/vp8/common/filter_c.c b/vp8/common/filter_c.c new file mode 100644 index 000000000..38991cb28 --- /dev/null +++ b/vp8/common/filter_c.c @@ -0,0 +1,539 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <stdlib.h> + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + + +static const int bilinear_filters[8][2] = +{ + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + + +static const short sub_pel_filters[8][6] = +{ + + { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic + { 0, -6, 123, 12, -1, 0 }, + { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter + { 0, -9, 93, 50, -6, 0 }, + { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter + { 0, -6, 50, 93, -9, 0 }, + { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter + { 0, -1, 12, 123, -6, 0 }, + + + +}; + +void vp8_filter_block2d_first_pass +( + unsigned char *src_ptr, + int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255 + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = Temp; + src_ptr++; + } + + // Next row... 
+ src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void vp8_filter_block2d_second_pass +( + int *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + // Apply filter + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255 + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = (unsigned char)Temp; + src_ptr++; + } + + // Start next row + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + + +void vp8_filter_block2d +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + int output_pitch, + const short *HFilter, + const short *VFilter +) +{ + int FData[9*4]; // Temp data bufffer used in filtering + + // First filter 1-D horizontally... + vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter); + + // then filter verticaly... 
+ vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter); +} + + +void vp8_block_variation_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int *HVar, + int *VVar +) +{ + int i, j; + unsigned char *Ptr = src_ptr; + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]); + *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]); + } + + Ptr += src_pixels_per_line; + } +} + + + + +void vp8_sixtap_predict_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); +} +void vp8_sixtap_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; // Temp data bufffer used in filtering + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + // First filter 1-D horizontally... + vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter); + + + // then filter verticaly... + vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); + +} + +void vp8_sixtap_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; // Temp data bufffer used in filtering + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + // First filter 1-D horizontally... 
+ vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter); + + + // then filter verticaly... + vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter); + +} + +void vp8_sixtap_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[21*24]; // Temp data bufffer used in filtering + + + HFilter = sub_pel_filters[xoffset]; // 6 tap + VFilter = sub_pel_filters[yoffset]; // 6 tap + + // First filter 1-D horizontally... + vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter); + + // then filter verticaly... + vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter); + +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * UINT32 pixel_step : Offset between filter input samples (see notes). + * UINT32 output_height : Input block height. + * UINT32 output_width : Input block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : INT32 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement first-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step=stride). 
+ * It defines the offset required to move from one input + * to the next. + * + ****************************************************************************/ +void vp8_filter_block2d_bil_first_pass +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int *vp8_filter +) +{ + unsigned int i, j; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + // Apply bilinear filter + output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[pixel_step] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : INT32 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * UINT32 pixel_step : Offset between filter input samples (see notes). + * UINT32 output_height : Input block height. + * UINT32 output_width : Input block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement second-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step=stride). + * It defines the offset required to move from one input + * to the next. 
+ * + ****************************************************************************/ +void vp8_filter_block2d_bil_second_pass +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + // Apply filter + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[pixel_step] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * INT32 *HFilter : Array of 2 horizontal filter taps. + * INT32 *VFilter : Array of 2 vertical filter taps. + * + * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : 2-D filters an input block by applying a 2-tap + * bi-linear filter horizontally followed by a 2-tap + * bi-linear filter vertically on the result. + * + * SPECIAL NOTES : The largest block size can be handled here is 16x16 + * + ****************************************************************************/ +void vp8_filter_block2d_bil +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int dst_pitch, + const int *HFilter, + const int *VFilter, + int Width, + int Height +) +{ + + unsigned short FData[17*16]; // Temp data bufffer used in filtering + + // First filter 1-D horizontally... 
+ vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter); + + // then 1-D vertically... + vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const int *HFilter; + const int *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; +#if 0 + { + int i; + unsigned char temp1[16]; + unsigned char temp2[16]; + + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + + for (i = 0; i < 16; i++) + { + if (temp1[i] != temp2[i]) + { + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + } + } + } +#endif + vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); + +} + +void vp8_bilinear_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const int *HFilter; + const int *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); + +} + +void vp8_bilinear_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const int *HFilter; + const int *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); + +} + +void vp8_bilinear_predict16x16_c +( + 
unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const int *HFilter; + const int *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c new file mode 100644 index 000000000..fcb1f202c --- /dev/null +++ b/vp8/common/findnearmv.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "findnearmv.h" + +#define FINDNEAR_SEARCH_SITES 3 + +/* Predict motion vectors using those from already-decoded nearby blocks. + Note that we only consider one 4x4 subblock from each candidate 16x16 + macroblock. 
*/ + +typedef union +{ + unsigned int as_int; + MV as_mv; +} int_mv; /* facilitates rapid equality tests */ + +static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) +{ + MV xmv; + xmv = x->mbmi.mv.as_mv; + + if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe]) + { + xmv.row *= -1; + xmv.col *= -1; + } + + mvp->as_mv = xmv; +} + + +void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd) +{ + if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) + mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) + mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + + if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) + mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) + mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; +} + + +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + MV *nearest, + MV *nearby, + MV *best_mv, + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + const MODE_INFO *above = here - xd->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + int_mv near_mvs[4]; + int_mv *mv = near_mvs; + int *cntx = cnt; + enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV}; + + /* Zero accumulators */ + mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; + cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) + { + if (above->mbmi.mv.as_int) + { + (++mv)->as_int = above->mbmi.mv.as_int; + mv_bias(above, refframe, mv, ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) + { + if (left->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(left, refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = 
this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } + else + cnt[CNT_INTRA] += 2; + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) + { + if (aboveleft->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } + else + cnt[CNT_INTRA] += 1; + } + + /* If we have three distinct MV's ... */ + if (cnt[CNT_SPLITMV]) + { + /* See if above-left MV can be merged with NEAREST */ + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + cnt[CNT_NEAREST] += 1; + } + + cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + + (left->mbmi.mode == SPLITMV)) * 2 + + (aboveleft->mbmi.mode == SPLITMV); + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) + { + int tmp; + tmp = cnt[CNT_NEAREST]; + cnt[CNT_NEAREST] = cnt[CNT_NEAR]; + cnt[CNT_NEAR] = tmp; + tmp = near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = tmp; + } + + /* Use near_mvs[0] to store the "best" MV */ + if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) + near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; + + /* Set up return values */ + *best_mv = near_mvs[0].as_mv; + *nearest = near_mvs[CNT_NEAREST].as_mv; + *nearby = near_mvs[CNT_NEAR].as_mv; + + vp8_clamp_mv(nearest, xd); + vp8_clamp_mv(nearby, xd); + vp8_clamp_mv(best_mv, xd); //TODO: move this up before the copy +} + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +) +{ + p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0]; + p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1]; + p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2]; + p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3]; + //p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3]; + return p; +} + +const B_MODE_INFO *vp8_left_bmi(const 
MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + b += 4; + } + + return cur_mb->bmi + b - 1; +} + +const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride) +{ + if (!(b >> 2)) + { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + b += 16; + } + + return cur_mb->bmi + b - 4; +} diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h new file mode 100644 index 000000000..2c02033e6 --- /dev/null +++ b/vp8/common/findnearmv.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_FINDNEARMV_H +#define __INC_FINDNEARMV_H + +#include "mv.h" +#include "blockd.h" +#include "modecont.h" +#include "treecoder.h" + +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + MV *nearest, MV *nearby, MV *best, + int near_mv_ref_cts[4], + int refframe, + int *ref_frame_sign_bias +); + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +); + +const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b); + +const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride); + +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) + + +#endif diff --git a/vp8/common/fourcc.hpp b/vp8/common/fourcc.hpp new file mode 100644 index 000000000..5f1faed2f --- /dev/null +++ b/vp8/common/fourcc.hpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef FOURCC_HPP +#define FOURCC_HPP + +#include <iosfwd> +#include <cstring> + + +#if defined(__POWERPC__) || defined(__APPLE__) || defined(__MERKS__) +using namespace std; +#endif + +class four_cc +{ +public: + + four_cc(); + four_cc(const char*); + explicit four_cc(unsigned long); + + bool operator==(const four_cc&) const; + bool operator!=(const four_cc&) const; + + bool operator==(const char*) const; + bool operator!=(const char*) const; + + operator unsigned long() const; + unsigned long as_long() const; + + four_cc& operator=(unsigned long); + + char operator[](int) const; + + std::ostream& put(std::ostream&) const; + + bool printable() const; + +private: + + union + { + char code[4]; + unsigned long code_as_long; + }; + +}; + + +inline four_cc::four_cc() +{ +} + +inline four_cc::four_cc(unsigned long x) + : code_as_long(x) +{ +} + +inline four_cc::four_cc(const char* str) +{ + memcpy(code, str, 4); +} + + +inline bool four_cc::operator==(const four_cc& rhs) const +{ + return code_as_long == rhs.code_as_long; +} + +inline bool four_cc::operator!=(const four_cc& rhs) const +{ + return !operator==(rhs); +} + +inline bool four_cc::operator==(const char* rhs) const +{ + return (memcmp(code, rhs, 4) == 0); +} + +inline bool four_cc::operator!=(const char* rhs) const +{ + return !operator==(rhs); +} + + +inline four_cc::operator unsigned long() const +{ + return code_as_long; +} + +inline unsigned long four_cc::as_long() const +{ + return code_as_long; +} + +inline char four_cc::operator[](int i) const +{ + return code[i]; +} + +inline four_cc& four_cc::operator=(unsigned long val) +{ + code_as_long = val; + return *this; +} + +inline std::ostream& operator<<(std::ostream& os, const four_cc& rhs) +{ + return rhs.put(os); +} + +#endif diff --git a/vp8/common/g_common.h b/vp8/common/g_common.h new file mode 100644 index 000000000..e68c53e1c --- /dev/null 
+++ b/vp8/common/g_common.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +extern void (*vp8_clear_system_state)(void); +extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q); +extern void (*de_interlace) +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int Width, + int Height, + int Stride +); diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c new file mode 100644 index 000000000..0011ae0dc --- /dev/null +++ b/vp8/common/generic/systemdependent.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "g_common.h" +#include "subpixel.h" +#include "loopfilter.h" +#include "recon.h" +#include "idct.h" +#include "onyxc_int.h" + +extern void vp8_arch_x86_common_init(VP8_COMMON *ctx); + +void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); + +void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); + +void vp8_machine_specific_config(VP8_COMMON *ctx) +{ +#if CONFIG_RUNTIME_CPU_DETECT + VP8_COMMON_RTCD *rtcd = &ctx->rtcd; + + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c; + rtcd->idct.idct16 = vp8_short_idct4x4llm_c; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_c; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_c; + rtcd->recon.copy8x8 = vp8_copy_mem8x8_c; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_c; + rtcd->recon.recon = vp8_recon_b_c; + rtcd->recon.recon2 = vp8_recon2b_c; + rtcd->recon.recon4 = vp8_recon4b_c; + + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c; + +#if 
CONFIG_POSTPROC || CONFIG_VP8_ENCODER + rtcd->postproc.down = vp8_mbpost_proc_down_c; + rtcd->postproc.across = vp8_mbpost_proc_across_ip_c; + rtcd->postproc.downacross = vp8_post_proc_down_and_across_c; + rtcd->postproc.addnoise = vp8_plane_add_noise_c; +#endif + +#endif + // Pure C: + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; + +#if ARCH_X86 || ARCH_X86_64 + vp8_arch_x86_common_init(ctx); +#endif + +} diff --git a/vp8/common/header.h b/vp8/common/header.h new file mode 100644 index 000000000..8b2b0094a --- /dev/null +++ b/vp8/common/header.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_HEADER_H +#define __INC_HEADER_H + +/* 24 bits total */ +typedef struct +{ + unsigned int type: 1; + unsigned int version: 3; + unsigned int show_frame: 1; + + /* Allow 2^20 bytes = 8 megabits for first partition */ + + unsigned int first_partition_length_in_bytes: 19; + +#ifdef PACKET_TESTING + unsigned int frame_number; + unsigned int update_gold: 1; + unsigned int uses_gold: 1; + unsigned int update_last: 1; + unsigned int uses_last: 1; +#endif + +} VP8_HEADER; + +#ifdef PACKET_TESTING +#define VP8_HEADER_SIZE 8 +#else +#define VP8_HEADER_SIZE 3 +#endif + + +#endif diff --git a/vp8/common/idct.h b/vp8/common/idct.h new file mode 100644 index 000000000..47b5f0576 --- /dev/null +++ b/vp8/common/idct.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_IDCT_H +#define __INC_IDCT_H + +#define prototype_second_order(sym) \ + void sym(short *input, short *output) + +#define prototype_idct(sym) \ + void sym(short *input, short *output, int pitch) + +#define prototype_idct_scalar(sym) \ + void sym(short input, short *output, int pitch) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/idct_x86.h" +#endif + +#if ARCH_ARM +#include "arm/idct_arm.h" +#endif + +#ifndef vp8_idct_idct1 +#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c +#endif +extern prototype_idct(vp8_idct_idct1); + +#ifndef vp8_idct_idct16 +#define vp8_idct_idct16 vp8_short_idct4x4llm_c +#endif +extern prototype_idct(vp8_idct_idct16); + +#ifndef vp8_idct_idct1_scalar +#define vp8_idct_idct1_scalar vp8_dc_only_idct_c +#endif +extern prototype_idct_scalar(vp8_idct_idct1_scalar); + + +#ifndef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_c +#endif +extern prototype_second_order(vp8_idct_iwalsh1); + +#ifndef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_c +#endif +extern prototype_second_order(vp8_idct_iwalsh16); + +typedef prototype_idct((*vp8_idct_fn_t)); +typedef prototype_idct_scalar((*vp8_idct_scalar_fn_t)); +typedef prototype_second_order((*vp8_second_order_fn_t)); + +typedef struct +{ + vp8_idct_fn_t idct1; + vp8_idct_fn_t idct16; + vp8_idct_scalar_fn_t idct1_scalar; + + vp8_second_order_fn_t iwalsh1; + vp8_second_order_fn_t iwalsh16; +} vp8_idct_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define IDCT_INVOKE(ctx,fn) (ctx)->fn +#else +#define IDCT_INVOKE(ctx,fn) vp8_idct_##fn +#endif + +#endif diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c new file mode 100644 index 000000000..57cf8584e --- /dev/null +++ b/vp8/common/idctllm.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+static const int rounding = 0;
+/* 4x4 inverse DCT: column pass into `output`, then an in-place row pass
+ with a final (x + 4) >> 3 descale. `pitch` is halved below, so it is
+ presumably a byte stride over the short-sized output — TODO confirm. */
+void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = pitch >> 1;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+}
+
+/* DC-only shortcut: every output sample of the 4x4 block is
+ (input[0] + 4) >> 3. */
+void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1;
+ short *op = output;
+ int shortpitch = pitch >> 1;
+ a1 = ((input[0] + 4) >> 3);
+
+ for (i = 0; i < 4; i++)
+ {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += shortpitch;
+ }
+}
+
+
+/* Same DC-only fill as above, but the coefficient is passed by value
+ (see prototype_idct_scalar in idct.h). */
+void vp8_dc_only_idct_c(short input_dc, short *output, int pitch)
+{
+ int i;
+ int a1;
+ short *op = output;
+ int shortpitch = pitch >> 1;
+ a1 = ((input_dc + 4) >> 3);
+
+ for (i = 0; i < 4; i++)
+ {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += shortpitch;
+ }
+}
+
+/* 4x4 inverse Walsh-Hadamard transform (the "second order" transform,
+ per idct.h's prototype_second_order naming): column pass, then row pass
+ with a (x + 3) >> 3 descale. */
+void vp8_short_inv_walsh4x4_c(short *input, short *output)
+{
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = (a2 + 3) >> 3;
+ op[1] = (b2 + 3) >> 3;
+ op[2] = (c2 + 3) >> 3;
+ op[3] = (d2 + 3) >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+/* DC-only inverse WHT: all 16 outputs equal (input[0] + 3) >> 3. */
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
+{
+ int i;
+ int a1;
+ short *op = output;
+
+ a1 = ((input[0] + 3) >> 3);
+
+ for (i = 0; i < 4; i++)
+ {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += 4;
+ }
+}
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
new file mode 100644
index 000000000..1ff596ead
--- /dev/null
+++ b/vp8/common/invtrans.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors.
All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "invtrans.h"
+
+
+
+/* Scatter the 16 outputs of the second-order (Y2, block 24) inverse WHT
+ back into the DC slot (dqcoeff[0]) of each of the 16 luma blocks. */
+static void recon_dcblock(MACROBLOCKD *x)
+{
+ BLOCKD *b = &x->block[24];
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ x->block[i].dqcoeff[0] = b->diff[i];
+ }
+
+}
+
+/* Inverse-transform one block: full IDCT when eob > 1, otherwise the
+ DC-only shortcut (NOTE(review): presumably eob <= 1 means only the DC
+ coefficient is nonzero — confirm against the decode path). */
+void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
+{
+ if (b->eob > 1)
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+ else
+ IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+}
+
+
+/* Inverse transform of the 16 luma (Y) blocks, including the
+ second-order WHT / DC scatter. */
+void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ // do 2nd order transform on the dc block
+ IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
+
+ recon_dcblock(x);
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 32);
+ }
+
+}
+/* Inverse transform of the 8 chroma (U then V) blocks 16..23. */
+void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 16);
+ }
+
+}
+
+
+/* Inverse transform of a whole macroblock (luma + chroma). The second
+ order transform is skipped for B_PRED and SPLITMV modes, which do not
+ use the Y2 block. */
+void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ {
+ // do 2nd order transform on the dc block
+
+ IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
+ recon_dcblock(x);
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 32);
+ }
+
+
+ for (i = 16; i < 24; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 16);
+ }
+
+}
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h
new file mode 100644
index 000000000..93a40f956
--- /dev/null
+++ b/vp8/common/invtrans.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_INVTRANS_H +#define __INC_INVTRANS_H + +#include "vpx_ports/config.h" +#include "idct.h" +#include "blockd.h" +extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch); +extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); + +#endif diff --git a/vp8/common/littlend.h b/vp8/common/littlend.h new file mode 100644 index 000000000..08c525c5d --- /dev/null +++ b/vp8/common/littlend.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef _littlend_h +#define _littlend_h + +#if defined(__cplusplus) +extern "C" { +#endif + +#define invert2(x) (x) +#define invert4(x) (x) + +#define low_byte(x) (unsigned char)x +#define mid1Byte(x) (unsigned char)(x >> 8) +#define mid2Byte(x) (unsigned char)(x >> 16) +#define high_byte(x) (unsigned char)(x >> 24) + +#define SWAPENDS 0 + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c new file mode 100644 index 000000000..79e617754 --- /dev/null +++ b/vp8/common/loopfilter.c @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef unsigned char uc; + + +prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); +prototype_loopfilter(vp8_loop_filter_vertical_edge_c); +prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); +prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); +prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); +prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); + +// Horizontal MB filtering +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + +void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Vertical MB Filtering +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + 
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + +void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Horizontal B Filtering +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +// Vertical B Filtering +void vp8_loop_filter_bv_c(unsigned char *y_ptr, 
unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +void vp8_init_loop_filter(VP8_COMMON *cm) +{ + loop_filter_info *lfi = cm->lf_info; + LOOPFILTERTYPE lft = cm->filter_type; + int sharpness_lvl = cm->sharpness_level; + int frame_type = cm->frame_type; + int i, j; + + int block_inside_limit = 0; + int HEVThresh; + const int yhedge_boost = 2; + const int uvhedge_boost = 2; + + // For each possible value for the loop filter fill out a "loop_filter_info" entry. 
+ for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + + if (frame_type == KEY_FRAME) + { + if (filt_lvl >= 40) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + else + { + if (filt_lvl >= 40) + HEVThresh = 3; + else if (filt_lvl >= 20) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + + // Set loop filter paramaeters that control sharpness. + block_inside_limit = filt_lvl >> (sharpness_lvl > 0); + block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + + if (sharpness_lvl > 0) + { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) + block_inside_limit = 1; + + for (j = 0; j < 16; j++) + { + lfi[i].lim[j] = block_inside_limit; + lfi[i].mbflim[j] = filt_lvl + yhedge_boost; + lfi[i].mbthr[j] = HEVThresh; + lfi[i].flim[j] = filt_lvl; + lfi[i].thr[j] = HEVThresh; + lfi[i].uvlim[j] = block_inside_limit; + lfi[i].uvmbflim[j] = filt_lvl + uvhedge_boost; + lfi[i].uvmbthr[j] = HEVThresh; + lfi[i].uvflim[j] = filt_lvl; + lfi[i].uvthr[j] = HEVThresh; + } + + } + + // Set up the function pointers depending on the type of loop filtering selected + if (lft == NORMAL_LOOPFILTER) + { + cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v); + cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v); + cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h); + cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h); + } + else + { + cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v); + cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v); + cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h); + cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h); + } +} + +// Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding +// each frame. Check last_frame_type to skip the function most of times. 
+void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type) +{ + int HEVThresh; + int i, j; + + // For each possible value for the loop filter fill out a "loop_filter_info" entry. + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + + if (frame_type == KEY_FRAME) + { + if (filt_lvl >= 40) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + else + { + if (filt_lvl >= 40) + HEVThresh = 3; + else if (filt_lvl >= 20) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + + for (j = 0; j < 16; j++) + { + //lfi[i].lim[j] = block_inside_limit; + //lfi[i].mbflim[j] = filt_lvl+yhedge_boost; + lfi[i].mbthr[j] = HEVThresh; + //lfi[i].flim[j] = filt_lvl; + lfi[i].thr[j] = HEVThresh; + //lfi[i].uvlim[j] = block_inside_limit; + //lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost; + lfi[i].uvmbthr[j] = HEVThresh; + //lfi[i].uvflim[j] = filt_lvl; + lfi[i].uvthr[j] = HEVThresh; + } + } +} + + +void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level) +{ + MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi; + + if (mbd->mode_ref_lf_delta_enabled) + { + // Aplly delta for reference frame + *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame]; + + // Apply delta for mode + if (mbmi->ref_frame == INTRA_FRAME) + { + // Only the split mode BPRED has a further special case + if (mbmi->mode == B_PRED) + *filter_level += mbd->mode_lf_deltas[0]; + } + else + { + // Zero motion mode + if (mbmi->mode == ZEROMV) + *filter_level += mbd->mode_lf_deltas[1]; + + // Split MB motion mode + else if (mbmi->mode == SPLITMV) + *filter_level += mbd->mode_lf_deltas[3]; + + // All other inter motion modes (Nearest, Near, New) + else + *filter_level += mbd->mode_lf_deltas[2]; + } + + // Range check + if (*filter_level > MAX_LOOP_FILTER) + *filter_level = MAX_LOOP_FILTER; + else if (*filter_level < 0) + *filter_level = 0; + } +} + + +void vp8_loop_filter_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int 
default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info *lfi = cm->lf_info; + int frame_type = cm->frame_type; + + int mb_row; + int mb_col; + + + int baseline_filter_level[MAX_MB_SEGMENTS]; + int filter_level; + int alt_flt_enabled = mbd->segmentation_enabled; + + int i; + unsigned char *y_ptr, *u_ptr, *v_ptr; + + mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list + + // Note the baseline filter values for each segment + if (alt_flt_enabled) + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + { + // Abs value + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + // Delta Value + else + { + baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range + } + } + } + else + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + baseline_filter_level[i] = default_filt_lvl; + } + + // Initialize the loop filter for this frame. + if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) + vp8_init_loop_filter(cm); + else if (frame_type != cm->last_frame_type) + vp8_frame_init_loop_filter(lfi, frame_type); + + // Set up the buffer pointers + y_ptr = post->y_buffer; + u_ptr = post->u_buffer; + v_ptr = post->v_buffer; + + // vp8_filter each macro block + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + + filter_level = baseline_filter_level[Segment]; + + // Distance of Mb to the various image edges. 
+ // These specified to 8th pel as they are always compared to values that are in 1/8th pel units + // Apply any context driven MB level adjustment + vp8_adjust_mb_lf_value(mbd, &filter_level); + + if (filter_level) + { + if (mb_col > 0) + cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + + if (mbd->mode_info_context->mbmi.dc_diff > 0) + cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + + // don't apply across umv border + if (mb_row > 0) + cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + + if (mbd->mode_info_context->mbmi.dc_diff > 0) + cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mbd->mode_info_context++; // step to next MB + } + + y_ptr += post->y_stride * 16 - post->y_width; + u_ptr += post->uv_stride * 8 - post->uv_width; + v_ptr += post->uv_stride * 8 - post->uv_width; + + mbd->mode_info_context++; // Skip border mb + } +} + + +void vp8_loop_filter_frame_yonly +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl, + int sharpness_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + int i; + unsigned char *y_ptr; + int mb_row; + int mb_col; + + loop_filter_info *lfi = cm->lf_info; + int baseline_filter_level[MAX_MB_SEGMENTS]; + int filter_level; + int alt_flt_enabled = mbd->segmentation_enabled; + int frame_type = cm->frame_type; + + (void) sharpness_lvl; + + //MODE_INFO * this_mb_mode_info = cm->mi; // Point at base of Mb MODE_INFO list + mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list + + // Note the baseline filter values for each segment + if (alt_flt_enabled) + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + { + // Abs value + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + baseline_filter_level[i] = 
mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + // Delta Value + else + { + baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range + } + } + } + else + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + baseline_filter_level[i] = default_filt_lvl; + } + + // Initialize the loop filter for this frame. + if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) + vp8_init_loop_filter(cm); + else if (frame_type != cm->last_frame_type) + vp8_frame_init_loop_filter(lfi, frame_type); + + // Set up the buffer pointers + y_ptr = post->y_buffer; + + // vp8_filter each macro block + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + filter_level = baseline_filter_level[Segment]; + + // Apply any context driven MB level adjustment + vp8_adjust_mb_lf_value(mbd, &filter_level); + + if (filter_level) + { + if (mb_col > 0) + cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + + if (mbd->mode_info_context->mbmi.dc_diff > 0) + cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + + // don't apply across umv border + if (mb_row > 0) + cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + + if (mbd->mode_info_context->mbmi.dc_diff > 0) + cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + } + + y_ptr += 16; + mbd->mode_info_context ++; // step to next MB + + } + + y_ptr += post->y_stride * 16 - post->y_width; + mbd->mode_info_context ++; // Skip border mb + } + +} + + +void vp8_loop_filter_partial_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl, + int sharpness_lvl, + int Fraction +) +{ + YV12_BUFFER_CONFIG *post 
= cm->frame_to_show; + + int i; + unsigned char *y_ptr; + int mb_row; + int mb_col; + //int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + + int linestocopy; + + loop_filter_info *lfi = cm->lf_info; + int baseline_filter_level[MAX_MB_SEGMENTS]; + int filter_level; + int alt_flt_enabled = mbd->segmentation_enabled; + int frame_type = cm->frame_type; + + (void) sharpness_lvl; + + //MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list + mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list + + linestocopy = (post->y_height >> (4 + Fraction)); + + if (linestocopy < 1) + linestocopy = 1; + + linestocopy <<= 4; + + // Note the baseline filter values for each segment + if (alt_flt_enabled) + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + { + // Abs value + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + // Delta Value + else + { + baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range + } + } + } + else + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + baseline_filter_level[i] = default_filt_lvl; + } + + // Initialize the loop filter for this frame. 
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) + vp8_init_loop_filter(cm); + else if (frame_type != cm->last_frame_type) + vp8_frame_init_loop_filter(lfi, frame_type); + + // Set up the buffer pointers + y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; + + // vp8_filter each macro block + for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + filter_level = baseline_filter_level[Segment]; + + if (filter_level) + { + if (mb_col > 0) + cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + + if (mbd->mode_info_context->mbmi.dc_diff > 0) + cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + + cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + + if (mbd->mode_info_context->mbmi.dc_diff > 0) + cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + } + + y_ptr += 16; + mbd->mode_info_context += 1; // step to next MB + } + + y_ptr += post->y_stride * 16 - post->y_width; + mbd->mode_info_context += 1; // Skip border mb + } +} diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h new file mode 100644 index 000000000..c6ce508cc --- /dev/null +++ b/vp8/common/loopfilter.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef loopfilter_h +#define loopfilter_h + +#include "vpx_ports/mem.h" + +#define MAX_LOOP_FILTER 63 + +typedef enum +{ + NORMAL_LOOPFILTER = 0, + SIMPLE_LOOPFILTER = 1 +} LOOPFILTERTYPE; + +// FRK +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +// FRK +typedef struct +{ + DECLARE_ALIGNED(16, signed char, lim[16]); + DECLARE_ALIGNED(16, signed char, flim[16]); + DECLARE_ALIGNED(16, signed char, thr[16]); + DECLARE_ALIGNED(16, signed char, mbflim[16]); + DECLARE_ALIGNED(16, signed char, mbthr[16]); + DECLARE_ALIGNED(16, signed char, uvlim[16]); + DECLARE_ALIGNED(16, signed char, uvflim[16]); + DECLARE_ALIGNED(16, signed char, uvthr[16]); + DECLARE_ALIGNED(16, signed char, uvmbflim[16]); + DECLARE_ALIGNED(16, signed char, uvmbthr[16]); +} loop_filter_info; + + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const signed char *flimit,\ + const signed char *limit, const signed char *thresh, int count) + +#define prototype_loopfilter_block(sym) \ + void sym(unsigned char *y, unsigned char *u, unsigned char *v,\ + int ystride, int uv_stride, loop_filter_info *lfi, int simpler) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/loopfilter_x86.h" +#endif + +#if ARCH_ARM +#include "arm/loopfilter_arm.h" +#endif + +#ifndef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_c +#endif +extern prototype_loopfilter_block(vp8_lf_normal_mb_v); + +#ifndef vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_c +#endif +extern prototype_loopfilter_block(vp8_lf_normal_b_v); + +#ifndef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_c +#endif +extern prototype_loopfilter_block(vp8_lf_normal_mb_h); + +#ifndef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_c +#endif +extern prototype_loopfilter_block(vp8_lf_normal_b_h); + + +#ifndef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c +#endif +extern 
prototype_loopfilter_block(vp8_lf_simple_mb_v); + +#ifndef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_c +#endif +extern prototype_loopfilter_block(vp8_lf_simple_b_v); + +#ifndef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c +#endif +extern prototype_loopfilter_block(vp8_lf_simple_mb_h); + +#ifndef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_c +#endif +extern prototype_loopfilter_block(vp8_lf_simple_b_h); + +typedef prototype_loopfilter_block((*vp8_lf_block_fn_t)); +typedef struct +{ + vp8_lf_block_fn_t normal_mb_v; + vp8_lf_block_fn_t normal_b_v; + vp8_lf_block_fn_t normal_mb_h; + vp8_lf_block_fn_t normal_b_h; + vp8_lf_block_fn_t simple_mb_v; + vp8_lf_block_fn_t simple_b_v; + vp8_lf_block_fn_t simple_mb_h; + vp8_lf_block_fn_t simple_b_h; +} vp8_loopfilter_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define LF_INVOKE(ctx,fn) (ctx)->fn +#else +#define LF_INVOKE(ctx,fn) vp8_lf_##fn +#endif + + +#endif diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c new file mode 100644 index 000000000..7d16e4843 --- /dev/null +++ b/vp8/common/loopfilter_filters.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include <stdlib.h> +#include "loopfilter.h" +#include "onyxc_int.h" + + +#define NEW_LOOPFILTER_MASK + +typedef unsigned char uc; + +__inline signed char vp8_signed_char_clamp(int t) +{ + t = (t < -128 ? -128 : t); + t = (t > 127 ? 
127 : t); + return (signed char) t; +} + + +// should we apply any filter at all ( 11111111 yes, 00000000 no) +__inline signed char vp8_filter_mask(signed char limit, signed char flimit, + uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3) +{ + signed char mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; +#ifndef NEW_LOOPFILTER_MASK + mask |= (abs(p0 - q0) > flimit) * -1; +#else + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1; +#endif + mask = ~mask; + return mask; +} + +// is there high variance internal edge ( 11111111 yes, 00000000 no) +__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1) +{ + signed char hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +__inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1) + +{ + signed char ps0, qs0; + signed char ps1, qs1; + signed char vp8_filter, Filter1, Filter2; + signed char u; + + ps1 = (signed char) * op1 ^ 0x80; + ps0 = (signed char) * op0 ^ 0x80; + qs0 = (signed char) * oq0 ^ 0x80; + qs1 = (signed char) * oq1 ^ 0x80; + + // add outer taps if we have high edge variance + vp8_filter = vp8_signed_char_clamp(ps1 - qs1); + vp8_filter &= hev; + + // inner taps + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); + vp8_filter &= mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + Filter1 = vp8_signed_char_clamp(vp8_filter + 4); + Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter1 >>= 3; + Filter2 >>= 3; + u = vp8_signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = vp8_signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; 
+ vp8_filter = Filter1; + + // outer tap adjustments + vp8_filter += 1; + vp8_filter >>= 1; + vp8_filter &= ~hev; + + u = vp8_signed_char_clamp(qs1 - vp8_filter); + *oq1 = u ^ 0x80; + u = vp8_signed_char_clamp(ps1 + vp8_filter); + *op1 = u ^ 0x80; + +} +void vp8_loop_filter_horizontal_edge_c +( + unsigned char *s, + int p, //pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count +) +{ + int hev = 0; // high edge variance + signed char mask = 0; + int i = 0; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + do + { + mask = vp8_filter_mask(limit[i], flimit[i], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + + ++s; + } + while (++i < count * 8); +} + +void vp8_loop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count +) +{ + int hev = 0; // high edge variance + signed char mask = 0; + int i = 0; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ do + { + mask = vp8_filter_mask(limit[i], flimit[i], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + + vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); + + s += p; + } + while (++i < count * 8); +} + +__inline void vp8_mbfilter(signed char mask, signed char hev, + uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) +{ + signed char s, u; + signed char vp8_filter, Filter1, Filter2; + signed char ps2 = (signed char) * op2 ^ 0x80; + signed char ps1 = (signed char) * op1 ^ 0x80; + signed char ps0 = (signed char) * op0 ^ 0x80; + signed char qs0 = (signed char) * oq0 ^ 0x80; + signed char qs1 = (signed char) * oq1 ^ 0x80; + signed char qs2 = (signed char) * oq2 ^ 0x80; + + // add outer taps if we have high edge variance + vp8_filter = vp8_signed_char_clamp(ps1 - qs1); + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); + vp8_filter &= mask; + + Filter2 = vp8_filter; + Filter2 &= hev; + + // save bottom 3 bits so that we round one side +4 and the other +3 + Filter1 = vp8_signed_char_clamp(Filter2 + 4); + Filter2 = vp8_signed_char_clamp(Filter2 + 3); + Filter1 >>= 3; + Filter2 >>= 3; + qs0 = vp8_signed_char_clamp(qs0 - Filter1); + ps0 = vp8_signed_char_clamp(ps0 + Filter2); + + + // only apply wider filter if not high edge variance + vp8_filter &= ~hev; + Filter2 = vp8_filter; + + // roughly 3/7th difference across boundary + u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); + s = vp8_signed_char_clamp(qs0 - u); + *oq0 = s ^ 0x80; + s = vp8_signed_char_clamp(ps0 + u); + *op0 = s ^ 0x80; + + // roughly 2/7th difference across boundary + u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7); + s = vp8_signed_char_clamp(qs1 - u); + *oq1 = s ^ 0x80; + s = vp8_signed_char_clamp(ps1 + u); + *op1 = s ^ 0x80; + + // roughly 1/7th difference across boundary + u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); + s = vp8_signed_char_clamp(qs2 - u); + *oq2 = s ^ 0x80; + s = 
vp8_signed_char_clamp(ps2 + u); + *op2 = s ^ 0x80; +} + +void vp8_mbloop_filter_horizontal_edge_c +( + unsigned char *s, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count +) +{ + signed char hev = 0; // high edge variance + signed char mask = 0; + int i = 0; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + do + { + + mask = vp8_filter_mask(limit[i], flimit[i], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); + + ++s; + } + while (++i < count * 8); + +} + + +void vp8_mbloop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count +) +{ + signed char hev = 0; // high edge variance + signed char mask = 0; + int i = 0; + + do + { + + mask = vp8_filter_mask(limit[i], flimit[i], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + + vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); + + s += p; + } + while (++i < count * 8); + +} + +// should we apply any filter at all ( 11111111 yes, 00000000 no) +__inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1) +{ +// Why does this cause problems for win32? 
+// error C2143: syntax error : missing ';' before 'type' +// (void) limit; +#ifndef NEW_LOOPFILTER_MASK + signed char mask = (abs(p0 - q0) <= flimit) * -1; +#else + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1; +#endif + return mask; +} + +__inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1) +{ + signed char vp8_filter, Filter1, Filter2; + signed char p1 = (signed char) * op1 ^ 0x80; + signed char p0 = (signed char) * op0 ^ 0x80; + signed char q0 = (signed char) * oq0 ^ 0x80; + signed char q1 = (signed char) * oq1 ^ 0x80; + signed char u; + + vp8_filter = vp8_signed_char_clamp(p1 - q1); + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0)); + vp8_filter &= mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + Filter1 = vp8_signed_char_clamp(vp8_filter + 4); + Filter1 >>= 3; + u = vp8_signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; + + Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter2 >>= 3; + u = vp8_signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} + +void vp8_loop_filter_simple_horizontal_edge_c +( + unsigned char *s, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count +) +{ + signed char mask = 0; + int i = 0; + (void) thresh; + + do + { + //mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]); + mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } + while (++i < count * 8); +} + +void vp8_loop_filter_simple_vertical_edge_c +( + unsigned char *s, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count +) +{ + signed char mask = 0; + int i = 0; + (void) thresh; + + do + { + //mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]); + mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], 
s[1]); + vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); + s += p; + } + while (++i < count * 8); + +} diff --git a/vp8/common/mac_specs.h b/vp8/common/mac_specs.h new file mode 100644 index 000000000..97bffc776 --- /dev/null +++ b/vp8/common/mac_specs.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#if !defined(_mac_specs_h) +#define _mac_specs_h + + +#if defined(__cplusplus) +extern "C" { +#endif + + extern unsigned int vp8_read_tsc(); + + extern unsigned int vp8_get_processor_freq(); + + extern unsigned int vpx_has_altivec(); + +#if defined(__cplusplus) +} +#endif + + +#endif diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c new file mode 100644 index 000000000..a7e0ce99a --- /dev/null +++ b/vp8/common/mbpitch.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "blockd.h" + +typedef enum +{ + PRED = 0, + DEST = 1, +} BLOCKSET; + +void vp8_setup_block +( + BLOCKD *b, + int mv_stride, + unsigned char **base, + int Stride, + int offset, + BLOCKSET bs +) +{ + + if (bs == DEST) + { + b->dst_stride = Stride; + b->dst = offset; + b->base_dst = base; + } + else + { + b->pre_stride = Stride; + b->pre = offset; + b->base_pre = base; + } + +} + +void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs) +{ + int block; + + unsigned char **y, **u, **v; + + if (bs == DEST) + { + y = &x->dst.y_buffer; + u = &x->dst.u_buffer; + v = &x->dst.v_buffer; + } + else + { + y = &x->pre.y_buffer; + u = &x->pre.u_buffer; + v = &x->pre.v_buffer; + } + + for (block = 0; block < 16; block++) // y blocks + { + vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride, + (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs); + } + + for (block = 16; block < 20; block++) // U and V blocks + { + vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride, + ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs); + + vp8_setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride, + ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs); + } +} + +void vp8_setup_block_dptrs(MACROBLOCKD *x) +{ + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + x->block[r*4+c].diff = &x->diff[r * 4 * 16 + c * 4]; + x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[16+r*2+c].diff = &x->diff[256 + r * 4 * 8 + c * 4]; + x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[20+r*2+c].diff = &x->diff[320+ r * 4 * 8 + c * 4]; + x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4; + + } + } + + x->block[24].diff = &x->diff[384]; + + for (r = 0; r < 25; r++) + { + 
x->block[r].qcoeff = x->qcoeff + r * 16; + x->block[r].dqcoeff = x->dqcoeff + r * 16; + } +} + +void vp8_build_block_doffsets(MACROBLOCKD *x) +{ + + // handle the destination pitch features + vp8_setup_macroblock(x, DEST); + vp8_setup_macroblock(x, PRED); +} diff --git a/vp8/common/modecont.c b/vp8/common/modecont.c new file mode 100644 index 000000000..9301a2567 --- /dev/null +++ b/vp8/common/modecont.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "entropy.h" + +const int vp8_mode_contexts[6][4] = +{ + { + // 0 + 7, 1, 1, 143, + }, + { + // 1 + 14, 18, 14, 107, + }, + { + // 2 + 135, 64, 57, 68, + }, + { + // 3 + 60, 56, 128, 65, + }, + { + // 4 + 159, 134, 128, 34, + }, + { + // 5 + 234, 188, 128, 28, + }, +}; diff --git a/vp8/common/modecont.h b/vp8/common/modecont.h new file mode 100644 index 000000000..0c57651ed --- /dev/null +++ b/vp8/common/modecont.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_MODECONT_H +#define __INC_MODECONT_H + +extern const int vp8_mode_contexts[6][4]; + +#endif diff --git a/vp8/common/modecontext.c b/vp8/common/modecontext.c new file mode 100644 index 000000000..ceee74c70 --- /dev/null +++ b/vp8/common/modecontext.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "entropymode.h" + +const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] = +{ + { + //Above Mode : 0 + { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, // left_mode 0 + { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, // left_mode 1 + { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, // left_mode 2 + { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, // left_mode 3 + { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, // left_mode 4 + { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, // left_mode 5 + { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, // left_mode 6 + { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, // left_mode 7 + { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, // left_mode 8 + { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, // left_mode 9 + }, + { + //Above Mode : 1 + { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, // left_mode 0 + { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, // left_mode 1 + { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, // left_mode 2 + { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, // left_mode 3 + { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, // left_mode 4 + { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, // left_mode 5 + { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, // left_mode 6 + { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, // left_mode 7 + { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, // left_mode 8 + { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, // left_mode 9 + }, + { + //Above Mode : 2 + { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, // left_mode 0 + { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, // left_mode 1 + { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, // left_mode 2 + { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, // 
left_mode 3 + { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, // left_mode 4 + { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, // left_mode 5 + { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, // left_mode 6 + { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, // left_mode 7 + { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, // left_mode 8 + { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, // left_mode 9 + }, + { + //Above Mode : 3 + { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, // left_mode 0 + { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, // left_mode 1 + { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, // left_mode 2 + { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, // left_mode 3 + { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, // left_mode 4 + { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, // left_mode 5 + { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, // left_mode 6 + { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, // left_mode 7 + { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, // left_mode 8 + { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, // left_mode 9 + }, + { + //Above Mode : 4 + { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, // left_mode 0 + { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, // left_mode 1 + { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, // left_mode 2 + { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, // left_mode 3 + { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, // left_mode 4 + { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, // left_mode 5 + { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, // left_mode 6 + { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, // left_mode 7 + { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, // left_mode 8 + { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, // left_mode 9 + }, + { + //Above Mode : 5 + { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, // left_mode 0 + { 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, // left_mode 1 + { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, // left_mode 2 + { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, // left_mode 3 + { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, // left_mode 4 + { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, // left_mode 5 + { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, // left_mode 6 + { 11, 4, 8, 1, 1, 4, 4, 5, 
4, 1, }, // left_mode 7 + { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, // left_mode 8 + { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, // left_mode 9 + }, + { + //Above Mode : 6 + { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, // left_mode 0 + { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, // left_mode 1 + { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, // left_mode 2 + { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, // left_mode 3 + { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, // left_mode 4 + { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, // left_mode 5 + { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, // left_mode 6 + { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, // left_mode 7 + { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, // left_mode 8 + { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, // left_mode 9 + }, + { + //Above Mode : 7 + { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, // left_mode 0 + { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, // left_mode 1 + { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, // left_mode 2 + { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, // left_mode 3 + { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, // left_mode 4 + { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, // left_mode 5 + { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, // left_mode 6 + { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, // left_mode 7 + { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, // left_mode 8 + { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, // left_mode 9 + }, + { + //Above Mode : 8 + { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, // left_mode 0 + { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, // left_mode 1 + { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, // left_mode 2 + { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, // left_mode 3 + { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, // left_mode 4 + { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, // left_mode 5 + { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, // left_mode 6 + { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, // left_mode 7 + { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, // left_mode 8 + { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, // left_mode 9 + }, + { + //Above Mode : 9 + { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, // left_mode 0 + { 113, 112, 
21, 22, 10, 2, 8, 4, 6, 42, }, // left_mode 1 + { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, // left_mode 2 + { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, // left_mode 3 + { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, // left_mode 4 + { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, // left_mode 5 + { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, // left_mode 6 + { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, // left_mode 7 + { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, // left_mode 8 + { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, // left_mode 9 + }, +}; diff --git a/vp8/common/mv.h b/vp8/common/mv.h new file mode 100644 index 000000000..3d8418108 --- /dev/null +++ b/vp8/common/mv.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_MV_H +#define __INC_MV_H + +typedef struct +{ + short row; + short col; +} MV; + +#endif diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h new file mode 100644 index 000000000..b66c40070 --- /dev/null +++ b/vp8/common/onyx.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_VP8_H +#define __INC_VP8_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "vpx_codec/internal/vpx_codec_internal.h" +#include "vpx_scale/yv12config.h" +#include "type_aliases.h" +#include "ppflags.h" + typedef int *VP8_PTR; + + /* Create/destroy static data structures. 
*/ + + typedef enum + { + NORMAL = 0, + FOURFIVE = 1, + THREEFIVE = 2, + ONETWO = 3 + + } VPX_SCALING; + + typedef enum + { + VP8_LAST_FLAG = 1, + VP8_GOLD_FLAG = 2, + VP8_ALT_FLAG = 4 + } VP8_REFFRAME; + + + typedef enum + { + USAGE_STREAM_FROM_SERVER = 0x0, + USAGE_LOCAL_FILE_PLAYBACK = 0x1 + } END_USAGE; + + + typedef enum + { + MODE_REALTIME = 0x0, + MODE_GOODQUALITY = 0x1, + MODE_BESTQUALITY = 0x2, + MODE_FIRSTPASS = 0x3, + MODE_SECONDPASS = 0x4, + MODE_SECONDPASS_BEST = 0x5, + } MODE; + + typedef enum + { + FRAMEFLAGS_KEY = 1, + FRAMEFLAGS_GOLDEN = 2, + FRAMEFLAGS_ALTREF = 4, + } FRAMETYPE_FLAGS; + + +#include <assert.h> + static __inline void Scale2Ratio(int mode, int *hr, int *hs) + { + switch (mode) + { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } + } + + typedef struct + { + int Version; // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode + int Width; // width of data passed to the compressor + int Height; // height of data passed to the compressor + double frame_rate; // set to passed in framerate + int target_bandwidth; // bandwidth to be used in kilobits per second + + int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 + int Sharpness; // parameter used for sharpening output: recommendation 0: + int cpu_used; + + // mode -> + //(0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing + // a television signal or feed from a live camera). ( speed setting controls how fast ) + //(1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to + // encode the output. ( speed setting controls how fast ) + //(2)=One Pass - Best Quality. 
The encoder places priority on the quality of the output over encoding + // speed. The output is compressed at the highest possible quality. This option takes the longest + // amount of time to encode. ( speed setting ignored ) + //(3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding + // pass. ( speed setting controls how fast ) + //(4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding + // pass to create the compressed output. ( speed setting controls how fast ) + //(5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first + // encoding pass to create the compressed output using the highest possible quality, and taking a + // longer amount of time to encode.. ( speed setting ignored ) + int Mode; // + + // Key Framing Operations + int auto_key; // automatically detect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. + + int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) + int lag_in_frames; // how many frames lag before we start encoding + + //---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + int end_usage; // vbr or cbr + + // shoot to keep buffer full at all times by undershooting a bit 95 recommended + int under_shoot_pct; + + // buffering parameters + int starting_buffer_level; // in seconds + int optimal_buffer_level; + int maximum_buffer_size; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + + // allow internal resizing ( currently disabled in the build !!!!!) 
+ int allow_spatial_resampling;
+ int resample_down_water_mark;
+ int resample_up_water_mark;
+
+ // allow internal frame rate alterations
+ int allow_df;
+ int drop_frames_water_mark;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ //----------------------------------------------------------------
+
+
+ // these parameters aren't to be used in the final build -- don't use!!!
+ int play_alternate;
+ int alt_freq;
+ int alt_q;
+ int key_q;
+ int gold_q;
+
+
+ int multi_threaded; // how many threads to run the encoder on
+ int token_partitions; // how many token partitions to create for multi-core decoding
+ int encode_breakout; // early breakout encode threshold: for video conferencing, 800 is recommended
+
+ int error_resilient_mode; // if running over UDP networks, provides decodable frames after a
+ // dropped packet
+
+ int arnr_max_frames;
+ int arnr_strength ;
+ int arnr_type ;
+
+
+ struct vpx_fixed_buf two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
+ } VP8_CONFIG;
+
+
+ void vp8_initialize();
+
+ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf);
+ void vp8_remove_compressor(VP8_PTR *comp);
+
+ void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
+ void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
+
+// receive a frame's worth of data; the caller can assume that a copy of this frame is made
+// and not just a copy of the pointer.. 
+ int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp); + int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush); + int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags); + + int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags); + int vp8_update_reference(VP8_PTR comp, int ref_frame_flags); + int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8_update_entropy(VP8_PTR comp, int update); + int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); + int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols); + int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + int vp8_get_quantizer(VP8_PTR c); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h new file mode 100644 index 000000000..a40ffb9f0 --- /dev/null +++ b/vp8/common/onyxc_int.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef __INC_VP8C_INT_H +#define __INC_VP8C_INT_H + +#include "vpx_ports/config.h" +#include "vpx_codec/internal/vpx_codec_internal.h" +#include "loopfilter.h" +#include "entropymv.h" +#include "entropy.h" +#include "idct.h" +#include "recon.h" +#include "postproc.h" + +//#ifdef PACKET_TESTING +#include "header.h" +//#endif + +/* Create/destroy static data structures. */ + +void vp8_initialize_common(void); + +#define MINQ 0 +#define MAXQ 127 +#define QINDEX_RANGE (MAXQ + 1) + + +typedef struct frame_contexts +{ + vp8_prob bmode_prob [VP8_BINTRAMODES-1]; + vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */ + vp8_prob uv_mode_prob [VP8_UV_MODES-1]; + vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1]; + vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1]; + MV_CONTEXT mvc[2]; + MV_CONTEXT pre_mvc[2]; //not to caculate the mvcost for the frame if mvc doesn't change. +} FRAME_CONTEXT; + +typedef enum +{ + ONE_PARTITION = 0, + TWO_PARTITION = 1, + FOUR_PARTITION = 2, + EIGHT_PARTITION = 3 +} TOKEN_PARTITION; + +typedef enum +{ + RECON_CLAMP_REQUIRED = 0, + RECON_CLAMP_NOTREQUIRED = 1 +} CLAMP_TYPE; + +typedef enum +{ + SIXTAP = 0, + BILINEAR = 1 +} INTERPOLATIONFILTERTYPE; + +typedef struct VP8_COMMON_RTCD +{ +#if CONFIG_RUNTIME_CPU_DETECT + vp8_idct_rtcd_vtable_t idct; + vp8_recon_rtcd_vtable_t recon; + vp8_subpix_rtcd_vtable_t subpix; + vp8_loopfilter_rtcd_vtable_t loopfilter; + vp8_postproc_rtcd_vtable_t postproc; +#else + int unused; +#endif +} VP8_COMMON_RTCD; + +typedef struct VP8Common +{ + struct vpx_internal_error_info error; + + DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][4][4]); + + int Width; + int Height; + int horiz_scale; + int vert_scale; + + YUV_TYPE clr_type; + CLAMP_TYPE clamp_type; + + YV12_BUFFER_CONFIG last_frame; + YV12_BUFFER_CONFIG golden_frame; + 
 YV12_BUFFER_CONFIG alt_ref_frame;
+ YV12_BUFFER_CONFIG new_frame;
+ YV12_BUFFER_CONFIG *frame_to_show;
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+ FRAME_TYPE last_frame_type; // Used to check whether vp8_frame_init_loop_filter() can be skipped.
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ // profile settings
+ int mb_no_coeff_skip;
+ int no_lpf;
+ int simpler_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+ int last_kf_gf_q; // Q used on the last GF or KF
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+ unsigned char *gf_active_flags; // Record of which MBs still refer to the last golden frame, either directly or through a (0,0) motion vector
+ int gf_active_count;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. 
*/ + + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ + + + INTERPOLATIONFILTERTYPE mcomp_filter_type; + LOOPFILTERTYPE last_filter_type; + LOOPFILTERTYPE filter_type; + loop_filter_info lf_info[MAX_LOOP_FILTER+1]; + prototype_loopfilter_block((*lf_mbv)); + prototype_loopfilter_block((*lf_mbh)); + prototype_loopfilter_block((*lf_bv)); + prototype_loopfilter_block((*lf_bh)); + int filter_level; + int last_sharpness_level; + int sharpness_level; + + int refresh_last_frame; // Two state 0 = NO, 1 = YES + int refresh_golden_frame; // Two state 0 = NO, 1 = YES + int refresh_alt_ref_frame; // Two state 0 = NO, 1 = YES + + int copy_buffer_to_gf; // 0 none, 1 Last to GF, 2 ARF to GF + int copy_buffer_to_arf; // 0 none, 1 Last to ARF, 2 GF to ARF + + int refresh_entropy_probs; // Two state 0 = NO, 1 = YES + + int ref_frame_sign_bias[MAX_REF_FRAMES]; // Two state 0, 1 + + // Y,U,V,Y2 + ENTROPY_CONTEXT *above_context[4]; // row of context for each plane + ENTROPY_CONTEXT left_context[4][4]; // (up to) 4 contexts "" + + + // keyframe block modes are predicted by their above, left neighbors + + vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]; + vp8_prob kf_ymode_prob [VP8_YMODES-1]; /* keyframe "" */ + vp8_prob kf_uv_mode_prob [VP8_UV_MODES-1]; + + + FRAME_CONTEXT lfc; // last frame entropy + FRAME_CONTEXT fc; // this frame entropy + + unsigned int current_video_frame; + + int near_boffset[3]; + int version; + + TOKEN_PARTITION multi_token_partition; + +#ifdef PACKET_TESTING + VP8_HEADER oh; +#endif + double bitrate; + double framerate; + +#if CONFIG_RUNTIME_CPU_DETECT + VP8_COMMON_RTCD rtcd; +#endif + struct postproc_state postproc_state; +} VP8_COMMON; + + +void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level); +void vp8_init_loop_filter(VP8_COMMON *cm); +extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); + +#endif diff --git 
a/vp8/common/onyxd.h b/vp8/common/onyxd.h new file mode 100644 index 000000000..644c0ec77 --- /dev/null +++ b/vp8/common/onyxd.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_VP8D_H +#define __INC_VP8D_H + + +/* Create/destroy static data structures. */ +#ifdef __cplusplus +extern "C" +{ +#endif +#include "type_aliases.h" +#include "vpx_scale/yv12config.h" +#include "ppflags.h" +#include "vpx_ports/mem.h" + + typedef void *VP8D_PTR; + typedef struct + { + int Width; + int Height; + int Version; + int postprocess; + int max_threads; + } VP8D_CONFIG; + typedef enum + { + VP8_LAST_FLAG = 1, + VP8_GOLD_FLAG = 2, + VP8_ALT_FLAG = 4 + } VP8_REFFRAME; + + typedef enum + { + VP8D_OK = 0 + } VP8D_SETTING; + + void vp8dx_initialize(void); + + void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x); + + int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst); + + int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp); + int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags); + + int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + + VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf); + + void vp8dx_remove_decompressor(VP8D_PTR comp); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/vp8/common/partialgfupdate.h b/vp8/common/partialgfupdate.h new file mode 100644 index 000000000..32a55ee6c --- /dev/null +++ b/vp8/common/partialgfupdate.h @@ -0,0 
+1,18 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_PARTIALGFUPDATE_H +#define __INC_PARTIALGFUPDATE_H + +#include "onyxc_int.h" + +extern void update_gf_selective(ONYX_COMMON *cm, MACROBLOCKD *x); + +#endif diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c new file mode 100644 index 000000000..f019925c3 --- /dev/null +++ b/vp8/common/postproc.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "vpx_scale/yv12config.h" +#include "postproc.h" +#include "vpx_scale/yv12extend.h" +#include "vpx_scale/vpxscale.h" +#include "systemdependent.h" + +#include <math.h> +#include <stdlib.h> +#include <stdio.h> +// global constants + +static const short kernel5[] = +{ + 1, 1, 4, 1, 1 +}; + +const short vp8_rv[] = +{ + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, + 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, + 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, + 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, + 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, + 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, + 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, + 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, + 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, + 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, + 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, + 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, + 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, + 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, + 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, +}; + + +extern void vp8_blit_text(const char *msg, unsigned char *address, const int 
pitch); + +/*********************************************************************************************************** + */ +void vp8_post_proc_down_and_across_c +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +) +{ + unsigned char *p_src, *p_dst; + int row; + int col; + int i; + int v; + int pitch = src_pixels_per_line; + unsigned char d[8]; + (void)dst_pixels_per_line; + + for (row = 0; row < rows; row++) + { + // post_proc_down for one row + p_src = src_ptr; + p_dst = dst_ptr; + + for (col = 0; col < cols; col++) + { + + int kernel = 4; + int v = p_src[col]; + + for (i = -2; i <= 2; i++) + { + if (abs(v - p_src[col+i*pitch]) > flimit) + goto down_skip_convolve; + + kernel += kernel5[2+i] * p_src[col+i*pitch]; + } + + v = (kernel >> 3); + down_skip_convolve: + p_dst[col] = v; + } + + // now post_proc_across + p_src = dst_ptr; + p_dst = dst_ptr; + + for (i = 0; i < 8; i++) + d[i] = p_src[i]; + + for (col = 0; col < cols; col++) + { + int kernel = 4; + v = p_src[col]; + + d[col&7] = v; + + for (i = -2; i <= 2; i++) + { + if (abs(v - p_src[col+i]) > flimit) + goto across_skip_convolve; + + kernel += kernel5[2+i] * p_src[col+i]; + } + + d[col&7] = (kernel >> 3); + across_skip_convolve: + + if (col >= 2) + p_dst[col-2] = d[(col-2)&7]; + } + + //handle the last two pixels + p_dst[col-2] = d[(col-2)&7]; + p_dst[col-1] = d[(col-1)&7]; + + + //next row + src_ptr += pitch; + dst_ptr += pitch; + } +} + +int vp8_q2mbl(int x) +{ + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} +void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit) +{ + int r, c, i; + + unsigned char *s = src; + unsigned char d[16]; + + + for (r = 0; r < rows; r++) + { + int sumsq = 0; + int sum = 0; + + for (i = -8; i <= 6; i++) + { + sumsq += s[i] * s[i]; + sum += s[i]; + d[i+8] = 0; + } + + for (c = 0; c < cols + 8; c++) + { + int x = s[c+7] 
- s[c-8]; + int y = s[c+7] + s[c-8]; + + sum += x; + sumsq += x * y; + + d[c&15] = s[c]; + + if (sumsq * 15 - sum * sum < flimit) + { + d[c&15] = (8 + sum + s[c]) >> 4; + } + + s[c-8] = d[(c-8)&15]; + } + + s += pitch; + } +} + + + + + +void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) +{ + int r, c, i; + const short *rv3 = &vp8_rv[63&rand()]; + + for (c = 0; c < cols; c++) + { + unsigned char *s = &dst[c]; + int sumsq = 0; + int sum = 0; + unsigned char d[16]; + const short *rv2 = rv3 + ((c * 17) & 127); + + for (i = -8; i <= 6; i++) + { + sumsq += s[i*pitch] * s[i*pitch]; + sum += s[i*pitch]; + } + + for (r = 0; r < rows + 8; r++) + { + sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch]; + sum += s[7*pitch] - s[-8*pitch]; + d[r&15] = s[0]; + + if (sumsq * 15 - sum * sum < flimit) + { + d[r&15] = (rv2[r&127] + sum + s[0]) >> 4; + } + + s[-8*pitch] = d[(r-8)&15]; + s += pitch; + } + } +} + + +static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp8_postproc_rtcd_vtable_t *rtcd) +{ + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + (void) low_var_thresh; + (void) flag; + + POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl); + POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q)); + POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q)); + + POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); + POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); + +} + +extern void vp8_deblock(YV12_BUFFER_CONFIG 
*source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp8_postproc_rtcd_vtable_t *rtcd) +{ + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + (void) low_var_thresh; + (void) flag; + + POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl); + POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); + POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); +} + +void vp8_de_noise(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp8_postproc_rtcd_vtable_t *rtcd) +{ + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + (void) post; + (void) low_var_thresh; + (void) flag; + + POSTPROC_INVOKE(rtcd, downacross)( + source->y_buffer + 2 * source->y_stride + 2, + source->y_buffer + 2 * source->y_stride + 2, + source->y_stride, + source->y_stride, + source->y_height - 4, + source->y_width - 4, + ppl); + POSTPROC_INVOKE(rtcd, downacross)( + source->u_buffer + 2 * source->uv_stride + 2, + source->u_buffer + 2 * source->uv_stride + 2, + source->uv_stride, + source->uv_stride, + source->uv_height - 4, + source->uv_width - 4, ppl); + POSTPROC_INVOKE(rtcd, downacross)( + source->v_buffer + 2 * source->uv_stride + 2, + source->v_buffer + 2 * source->uv_stride + 2, + source->uv_stride, + source->uv_stride, + source->uv_height - 4, + source->uv_width - 4, ppl); + +} + + +//Notes: It is better to change CHAR to unsigned or signed to +//avoid error on ARM platform. 
+char vp8_an[8][64][3072]; +int vp8_cd[8][64]; + + +double vp8_gaussian(double sigma, double mu, double x) +{ + return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); +} + +extern void (*vp8_clear_system_state)(void); + + +static void fillrd(struct postproc_state *state, int q, int a) +{ + char char_dist[300]; + + double sigma; + int ai = a, qi = q, i; + + vp8_clear_system_state(); + + + sigma = ai + .5 + .6 * (63 - qi) / 63.0; + + // set up a lookup table of 256 entries that matches + // a gaussian distribution with sigma determined by q. + // + { + double i; + int next, j; + + next = 0; + + for (i = -32; i < 32; i++) + { + int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); + + if (a) + { + for (j = 0; j < a; j++) + { + char_dist[next+j] = (char) i; + } + + next = next + j; + } + + } + + for (next = next; next < 256; next++) + char_dist[next] = 0; + + } + + for (i = 0; i < 3072; i++) + { + state->noise[i] = char_dist[rand() & 0xff]; + } + + for (i = 0; i < 16; i++) + { + state->blackclamp[i] = -char_dist[0]; + state->whiteclamp[i] = -char_dist[0]; + state->bothclamp[i] = -2 * char_dist[0]; + } + + state->last_q = q; + state->last_noise = a; +} + +/**************************************************************************** + * + * ROUTINE : plane_add_noise_c + * + * INPUTS : unsigned char *Start starting address of buffer to add gaussian + * noise to + * unsigned int Width width of plane + * unsigned int Height height of plane + * int Pitch distance between subsequent lines of frame + * int q quantizer used to determine amount of noise + * to add + * + * OUTPUTS : None. + * + * RETURNS : void. + * + * FUNCTION : adds gaussian noise to a plane of pixels + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +void vp8_plane_add_noise_c(unsigned char *Start, char *noise, + char blackclamp[16], + char whiteclamp[16], + char bothclamp[16], + unsigned int Width, unsigned int Height, int Pitch) +{ + unsigned int i, j; + + for (i = 0; i < Height; i++) + { + unsigned char *Pos = Start + i * Pitch; + char *Ref = (char *)(noise + (rand() & 0xff)); + + for (j = 0; j < Width; j++) + { + if (Pos[j] < blackclamp[0]) + Pos[j] = blackclamp[0]; + + if (Pos[j] > 255 + whiteclamp[0]) + Pos[j] = 255 + whiteclamp[0]; + + Pos[j] += Ref[j]; + } + } +} + +#if CONFIG_RUNTIME_CPU_DETECT +#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc) +#else +#define RTCD_VTABLE(oci) NULL +#endif + +int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags) +{ + char message[512]; + int q = oci->filter_level * 10 / 6; + + if (!oci->frame_to_show) + return -1; + + if (q > 63) + q = 63; + + if (!flags) + { + *dest = *oci->frame_to_show; + + // handle problem with extending borders + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + return 0; + + } + +#if ARCH_X86||ARCH_X86_64 + vpx_reset_mmx_state(); +#endif + + if (flags & VP8D_DEMACROBLOCK) + { + vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci)); + } + else if (flags & VP8D_DEBLOCK) + { + vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer, + q, 1, 0, RTCD_VTABLE(oci)); + } + else + { + vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer); + } + + if (flags & VP8D_ADDNOISE) + { + if (oci->postproc_state.last_q != q + || oci->postproc_state.last_noise != noise_level) + { + fillrd(&oci->postproc_state, 63 - q, noise_level); + } + + POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise) + (oci->post_proc_buffer.y_buffer, + oci->postproc_state.noise, + oci->postproc_state.blackclamp, + 
oci->postproc_state.whiteclamp, + oci->postproc_state.bothclamp, + oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height, + oci->post_proc_buffer.y_stride); + } + + if (flags & VP8D_DEBUG_LEVEL1) + { + sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", + (oci->frame_type == KEY_FRAME), + oci->refresh_golden_frame, + oci->base_qindex, + oci->filter_level, + flags, + oci->mb_cols, oci->mb_rows); + vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); + } + else if (flags & VP8D_DEBUG_LEVEL2) + { + int i, j; + unsigned char *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + // vp8_filter each macro block + for (i = 0; i < mb_rows; i++) + { + for (j = 0; j < mb_cols; j++) + { + char zz[4]; + + sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); + + vp8_blit_text(zz, y_ptr, post->y_stride); + mb_index ++; + y_ptr += 16; + } + + mb_index ++; //border + y_ptr += post->y_stride * 16 - post->y_width; + + } + } + else if (flags & VP8D_DEBUG_LEVEL3) + { + int i, j; + unsigned char *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + // vp8_filter each macro block + for (i = 0; i < mb_rows; i++) + { + for (j = 0; j < mb_cols; j++) + { + char zz[4]; + + if (oci->frame_type == KEY_FRAME) + sprintf(zz, "a"); + else + sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0'); + + vp8_blit_text(zz, y_ptr, post->y_stride); + mb_index ++; + y_ptr += 16; + } + + mb_index ++; //border + y_ptr += post->y_stride * 16 - post->y_width; + + } + } + else if (flags & VP8D_DEBUG_LEVEL4) + { + sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate); + 
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); +#if 0 + int i, j; + unsigned char *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + // vp8_filter each macro block + for (i = 0; i < mb_rows; i++) + { + for (j = 0; j < mb_cols; j++) + { + char zz[4]; + + sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0'); + vp8_blit_text(zz, y_ptr, post->y_stride); + mb_index ++; + y_ptr += 16; + } + + mb_index ++; //border + y_ptr += post->y_stride * 16 - post->y_width; + + } + +#endif + + } + + + + *dest = oci->post_proc_buffer; + + // handle problem with extending borders + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + return 0; +} diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h new file mode 100644 index 000000000..c45fe9252 --- /dev/null +++ b/vp8/common/postproc.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef POSTPROC_H +#define POSTPROC_H + +#define prototype_postproc_inplace(sym)\ + void sym (unsigned char *dst, int pitch, int rows, int cols,int flimit) + +#define prototype_postproc(sym)\ + void sym (unsigned char *src, unsigned char *dst, int src_pitch,\ + int dst_pitch, int rows, int cols, int flimit) + +#define prototype_postproc_addnoise(sym) \ + void sym (unsigned char *s, char *noise, char blackclamp[16],\ + char whiteclamp[16], char bothclamp[16],\ + unsigned int w, unsigned int h, int pitch) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/postproc_x86.h" +#endif + +#ifndef vp8_postproc_down +#define vp8_postproc_down vp8_mbpost_proc_down_c +#endif +extern prototype_postproc_inplace(vp8_postproc_down); + +#ifndef vp8_postproc_across +#define vp8_postproc_across vp8_mbpost_proc_across_ip_c +#endif +extern prototype_postproc_inplace(vp8_postproc_across); + +#ifndef vp8_postproc_downacross +#define vp8_postproc_downacross vp8_post_proc_down_and_across_c +#endif +extern prototype_postproc(vp8_postproc_downacross); + +#ifndef vp8_postproc_addnoise +#define vp8_postproc_addnoise vp8_plane_add_noise_c +#endif +extern prototype_postproc_addnoise(vp8_postproc_addnoise); + + +typedef prototype_postproc((*vp8_postproc_fn_t)); +typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t)); +typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t)); +typedef struct +{ + vp8_postproc_inplace_fn_t down; + vp8_postproc_inplace_fn_t across; + vp8_postproc_fn_t downacross; + vp8_postproc_addnoise_fn_t addnoise; +} vp8_postproc_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn +#else +#define POSTPROC_INVOKE(ctx,fn) vp8_postproc_##fn +#endif + +#include "vpx_ports/mem.h" +struct postproc_state +{ + int last_q; + int last_noise; + char noise[3072]; + DECLARE_ALIGNED(16, char, blackclamp[16]); + DECLARE_ALIGNED(16, char, whiteclamp[16]); + DECLARE_ALIGNED(16, char, bothclamp[16]); +}; +#include "onyxc_int.h" 
+#include "ppflags.h" +int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, + int deblock_level, int noise_level, int flags); + + +void vp8_de_noise(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp8_postproc_rtcd_vtable_t *rtcd); +#endif diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm new file mode 100644 index 000000000..e87eb2112 --- /dev/null +++ b/vp8/common/ppc/copy_altivec.asm @@ -0,0 +1,46 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl copy_mem16x16_ppc + +;# r3 unsigned char *src +;# r4 int src_stride +;# r5 unsigned char *dst +;# r6 int dst_stride + +;# Make the assumption that input will not be aligned, +;# but the output will be. So two reads and a perm +;# for the input, but only one store for the output. +copy_mem16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xe000 + mtspr 256, r12 ;# set VRSAVE + + li r10, 16 + mtctr r10 + +cp_16x16_loop: + lvsl v0, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v1, v1, v2, v0 + + stvx v1, 0, r5 + + add r3, r3, r4 ;# increment source pointer + add r5, r5, r6 ;# increment destination pointer + + bdnz cp_16x16_loop + + mtspr 256, r11 ;# reset old VRSAVE + + blr diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm new file mode 100644 index 000000000..2a3550773 --- /dev/null +++ b/vp8/common/ppc/filter_altivec.asm @@ -0,0 +1,1012 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl sixtap_predict_ppc + .globl sixtap_predict8x4_ppc + .globl sixtap_predict8x8_ppc + .globl sixtap_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_hfilter V0, V1 + load_c \V0, HFilter, r5, r9, r10 + + addi r5, r5, 16 + lvx \V1, r5, r10 +.endm + +;# Vertical filtering +.macro Vprolog + load_c v0, VFilter, r6, r3, r10 + + vspltish v5, 8 + vspltish v6, 3 + vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v1, v0, 1 + vspltb v2, v0, 2 + vspltb v3, v0, 3 + vspltb v4, v0, 4 + vspltb v5, v0, 5 + vspltb v0, v0, 0 +.endm + +.macro vpre_load + Vprolog + li r10, 16 + lvx v10, 0, r9 ;# v10..v14 = first 5 rows + lvx v11, r10, r9 + addi r9, r9, 32 + lvx v12, 0, r9 + lvx v13, r10, r9 + addi r9, r9, 32 + lvx v14, 0, r9 +.endm + +.macro Msum Re, Ro, V, T, TMP + ;# (Re,Ro) += (V*T) + vmuleub \TMP, \V, \T ;# trashes v8 + vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary + vmuloub \TMP, \V, \T + vadduhm \Ro, \Ro, \TMP ;# Ro = odds +.endm + +.macro vinterp_no_store P0 P1 P2 P3 P4 P5 + vmuleub v8, \P0, v0 ;# 64 + 4 positive taps + vadduhm v16, v6, v8 + vmuloub v8, \P0, v0 + vadduhm v17, v6, v8 + Msum v16, v17, \P2, v2, v8 + Msum v16, v17, \P3, v3, v8 + Msum v16, v17, \P5, v5, v8 + + vmuleub v18, \P1, v1 ;# 2 negative taps + vmuloub v19, \P1, v1 + Msum v18, v19, \P4, v4, v8 + + vsubuhs v16, v16, v18 ;# subtract neg from pos + vsubuhs v17, v17, v19 + vsrh v16, v16, v7 ;# divide by 128 + vsrh v17, v17, v7 ;# v16 v17 = evens, odds + vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order + vmrglh v19, v16, v17 + vpkuhus \P0, v18, v19 ;# P0 = 8-bit result +.endm + +.macro vinterp_no_store_8x8 P0 
P1 P2 P3 P4 P5 + vmuleub v24, \P0, v13 ;# 64 + 4 positive taps + vadduhm v21, v20, v24 + vmuloub v24, \P0, v13 + vadduhm v22, v20, v24 + Msum v21, v22, \P2, v15, v25 + Msum v21, v22, \P3, v16, v25 + Msum v21, v22, \P5, v18, v25 + + vmuleub v23, \P1, v14 ;# 2 negative taps + vmuloub v24, \P1, v14 + Msum v23, v24, \P4, v17, v25 + + vsubuhs v21, v21, v23 ;# subtract neg from pos + vsubuhs v22, v22, v24 + vsrh v21, v21, v19 ;# divide by 128 + vsrh v22, v22, v19 ;# v16 v17 = evens, odds + vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order + vmrglh v24, v21, v22 + vpkuhus \P0, v23, v24 ;# P0 = 8-bit result +.endm + + +.macro Vinterp P0 P1 P2 P3 P4 P5 + vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 + stvx \P0, 0, r7 + add r7, r7, r8 ;# 33 ops per 16 pels +.endm + + +.macro luma_v P0, P1, P2, P3, P4, P5 + addi r9, r9, 16 ;# P5 = newest input row + lvx \P5, 0, r9 + Vinterp \P0, \P1, \P2, \P3, \P4, \P5 +.endm + +.macro luma_vtwo + luma_v v10, v11, v12, v13, v14, v15 + luma_v v11, v12, v13, v14, v15, v10 +.endm + +.macro luma_vfour + luma_vtwo + luma_v v12, v13, v14, v15, v10, v11 + luma_v v13, v14, v15, v10, v11, v12 +.endm + +.macro luma_vsix + luma_vfour + luma_v v14, v15, v10, v11, v12, v13 + luma_v v15, v10, v11, v12, v13, v14 +.endm + +.macro Interp4 R I I4 + vmsummbm \R, v13, \I, v15 + vmsummbm \R, v14, \I4, \R +.endm + +.macro Read8x8 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx \VD, 0, \RS + lvx v20, r10, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, \VD, v20, v21 +.endm + +.macro interp_8x8 R + vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 + vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A + Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 + vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx + Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 + + vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 + vsrh \R, \R, v19 + + vpkuhus \R, \R, \R ;# saturate and pack + +.endm + +.macro Read4x4 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v20, 0, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, v20, v20, v21 +.endm + .text + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +sixtap_predict_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xff87 + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- vertical_only_4x4 + + ;# load up horizontal filter + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. 
r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_4x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_4x4 + +vertical_only_4x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_4x4: + load_c v20, b_hilo_4x4, 0, r9, r10 + load_c v21, b_hilo, 0, r9, r10 + + ;# reposition input so that it can go through the + ;# filtering phase with one pass. 
+ vperm v0, v0, v1, v20 ;# 0 1 x x + vperm v2, v2, v3, v20 ;# 2 3 x x + vperm v4, v4, v5, v20 ;# 4 5 x x + vperm v6, v6, v7, v20 ;# 6 7 x x + + vperm v0, v0, v2, v21 ;# 0 1 2 3 + vperm v4, v4, v6, v21 ;# 4 5 6 7 + + vsldoi v1, v0, v4, 4 + vsldoi v2, v0, v4, 8 + vsldoi v3, v0, v4, 12 + + vsldoi v5, v4, v8, 4 + + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + + stvx v0, 0, r1 + + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 4(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 8(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 12(r1) + stw r0, 0(r7) + + b exit_4x4 + +store_4x4: + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v4, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v5, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +sixtap_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. 
+ beq- second_pass_pre_copy_8x4 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_8x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_8x4 + +second_pass_pre_copy_8x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x4: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, 
v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + + b exit_8x4 + +store_8x4: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x4 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + + b exit_8x4 + +store_aligned2_8x4: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Because the width that needs to be filtered will fit in a single altivec +;# register there is no need to loop. Everything can stay in registers. +sixtap_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. 
+ beq- second_pass_pre_copy_8x8 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + interp_8x8 v9 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. 
+ beq- store_8x8 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v10 + interp_8x8 v11 + interp_8x8 v12 + + b second_pass_8x8 + +second_pass_pre_copy_8x8: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x8: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 + vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 + vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 + vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, 
v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + + b exit_8x8 + +store_8x8: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x8 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + w_8x8 v8, r7, r0, r8 + w_8x8 v9, r7, r0, r8 + + b exit_8x8 + +store_aligned2_8x8: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + vperm v8, v8, v9, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + addi r7, r7, 16 + stvx v8, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Two pass filtering. First pass is Horizontal edges, second pass is vertical +;# edges. One of the filters can be null, but both won't be. Needs to use a +;# temporary buffer because the source buffer can't be modified and the buffer +;# for the destination is not large enough to hold the temporary data. +sixtap_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-416(r1) ;# create space on the stack + + ;# Three possiblities + ;# 1. First filter is null. Don't use a temp buffer. + ;# 2. Second filter is null. Don't use a temp buffer. + ;# 3. Neither are null, use temp buffer. + + ;# First Pass (horizontal edge) + ;# setup pointers for src + ;# if possiblity (1) then setup the src pointer to be the orginal and jump + ;# to second pass. this is based on if x_offset is 0. + + ;# load up horizontal filter + slwi. 
r5, r5, 5 ;# index into horizontal filter array + + load_hfilter v4, v5 + + beq- copy_horizontal_16x21 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# setup constants + ;# v14 permutation value for alignment + load_c v14, b_hperm, 0, r9, r10 + + ;# These statements are guessing that there won't be a second pass, + ;# but if there is then inside the bypass they need to be set + li r0, 16 ;# prepare for no vertical filter + + ;# Change the output pointer and pitch to be the actual + ;# desination instead of a temporary buffer. + addi r9, r7, 0 + addi r5, r8, 0 + + ;# no vertical filter, so write the output from the first pass + ;# directly into the output buffer. + beq- no_vertical_filter_bypass + + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + + ;# setup counter for the number of lines that are going to be filtered + li r0, 21 + + ;# use the stack as temporary storage + la r9, 48(r1) + li r5, 16 + +no_vertical_filter_bypass: + + mtctr r0 + + ;# rounding added in on the multiply + vspltisw v10, 8 + vspltisw v12, 3 + vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v13, 7 + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + +horizontal_loop_16x16: + + lvsl v15, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v1, 0, r3 + lvx v2, r10, r3 + lvx v3, r12, r3 + + vperm v8, v1, v2, v15 + vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified + + vsldoi v11, v8, v9, 4 + + ;# set 0 + vmsummbm v6, v4, v8, v12 ;# taps times elements + vmsummbm v0, v5, v11, v6 + + ;# set 1 + vsldoi v10, v8, v9, 1 + vsldoi v11, v8, v9, 5 + + vmsummbm v6, v4, v10, v12 + vmsummbm v1, v5, v11, v6 + + ;# set 2 + vsldoi v10, v8, v9, 2 + vsldoi v11, v8, v9, 6 + + vmsummbm v6, v4, v10, v12 + vmsummbm v2, v5, v11, v6 + + ;# set 3 + vsldoi v10, v8, v9, 3 + vsldoi v11, v8, v9, 7 + + vmsummbm v6, v4, v10, v12 + vmsummbm v3, v5, v11, v6 + + vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F + + vsrh v0, v0, v13 ;# divide v0, v1 by 128 + vsrh v1, v1, v13 + + vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result + vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result + + stvx v0, 0, r9 + add r9, r9, r5 + + add r3, r3, r4 + + bdnz horizontal_loop_16x16 + + ;# check again to see if vertical filter needs to be done. + cmpi cr0, r6, 0 + beq cr0, end_16x16 + + ;# yes there is, so go to the second pass + b second_pass_16x16 + +copy_horizontal_16x21: + li r10, 21 + mtctr r10 + + li r10, 16 + + sub r3, r3, r4 + sub r3, r3, r4 + + ;# this is done above if there is a horizontal filter, + ;# if not it needs to be done down here. 
+ slwi r6, r6, 4 ;# index into vertical filter array + + ;# always write to the stack when doing a horizontal copy + la r9, 48(r1) + +copy_horizontal_loop_16x21: + lvsl v15, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v8, v1, v2, v15 + + stvx v8, 0, r9 + addi r9, r9, 16 + + add r3, r3, r4 + + bdnz copy_horizontal_loop_16x21 + +second_pass_16x16: + + ;# always read from the stack when doing a vertical filter + la r9, 48(r1) + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v7, 7 + + vpre_load + + luma_vsix + luma_vsix + luma_vfour + +end_16x16: + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +HFilter: + .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 + .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 + .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 + .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 + .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 + .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 + .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 + .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 + .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 + .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 + .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + + .align 4 +VFilter: + .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 3, 16, 77, 77, 16, 3, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + .align 4 +b_hperm: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +B_0123: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +B_4567: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + + .align 4 +B_89AB: + .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + + .align 4 +b_hilo: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +b_hilo_4x4: + .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm new file mode 100644 index 000000000..27e02a87f --- /dev/null +++ b/vp8/common/ppc/filter_bilinear_altivec.asm @@ -0,0 +1,676 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl bilinear_predict4x4_ppc + .globl bilinear_predict8x4_ppc + .globl bilinear_predict8x8_ppc + .globl bilinear_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r9, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. 
+ beq \jump_label + + load_c v20, hfilter_b, r5, r9, r0 + + ;# setup constants + ;# v14 permutation value for alignment + load_c v28, b_hperm_b, 0, r9, r0 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro HFilter V + vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456 + vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro hfilter_8 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + HFilter \V +.endm + + +.macro load_and_align_8 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_aligned_8 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_4x4_b + + hfilter_8 v4, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_4x4_b: + + stvx v0, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v1, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_8x4_b + + hfilter_8 v4, 0 + + b second_pass_8x4_b + +second_pass_8x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_8x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_8x4_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + hfilter_8 v4, 1 + hfilter_8 v5, 1 + hfilter_8 v6, 1 + hfilter_8 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_8x8_b + + hfilter_8 v8, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + load_and_align_8 v5, 1 + load_and_align_8 v6, 1 + load_and_align_8 v7, 1 + load_and_align_8 v8, 0 + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +store_out_8x8_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + +.macro load_and_align_16 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_16 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. 
Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, 1 + load_and_align_16 v1, 1 + load_and_align_16 v2, 1 + load_and_align_16 v3, 1 + load_and_align_16 v4, 1 + load_and_align_16 v5, 1 + load_and_align_16 v6, 1 + load_and_align_16 v7, 1 + load_and_align_16 v8, 1 + load_and_align_16 v9, 1 + load_and_align_16 v10, 1 + load_and_align_16 v11, 1 + load_and_align_16 v12, 1 + load_and_align_16 v13, 1 + load_and_align_16 v14, 1 + load_and_align_16 v15, 1 + load_and_align_16 v16, 0 + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +store_out_16x16_b: + + write_16 v0, 1 + write_16 v1, 1 + write_16 v2, 1 + write_16 v3, 1 + write_16 v4, 1 + write_16 v5, 1 + write_16 v6, 1 + write_16 v7, 1 + write_16 v8, 1 + write_16 v9, 1 + write_16 v10, 1 + write_16 v11, 1 + write_16 v12, 1 + write_16 v13, 1 + write_16 v14, 1 + write_16 v15, 0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 
0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + + .align 4 +b_hperm_b: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +b_0123_b: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +b_4567_b: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + +b_hilo_b: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm new file mode 100644 index 000000000..e88af8d7d --- /dev/null +++ b/vp8/common/ppc/idctllm_altivec.asm @@ -0,0 +1,188 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl short_idct4x4llm_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +;# r3 short *input +;# r4 short *output +;# r5 int pitch + .align 2 +short_idct4x4llm_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + load_c v8, sinpi8sqrt2, 0, r9, r10 + load_c v9, cospi8sqrt2minus1, 0, r9, r10 + load_c v10, hi_hi, 0, r9, r10 + load_c v11, lo_lo, 0, r9, r10 + load_c v12, shift_16, 0, r9, r10 + + li r10, 16 + lvx v0, 0, r3 ;# input ip[0], ip[ 4] + lvx v1, r10, r3 ;# input ip[8], ip[12] + + ;# first pass + vupkhsh v2, v0 + vupkhsh v3, v1 + vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] + vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] + + vupklsh v0, v0 + vmulosh v4, v0, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vupklsh v1, v1 + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v3, v1, v8 + vsraw v3, v3, v12 + vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v0, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v0 + + vaddsws v3, v3, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + ;# transpose input + vmrghw v4, v0, v1 ;# a0 b0 a1 b1 + vmrghw v5, v2, v3 ;# c0 d0 c1 d1 + + vmrglw v6, v0, v1 ;# a2 b2 a3 b3 + vmrglw v7, v2, v3 ;# c2 d2 c3 d3 + + vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 + vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 + + vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 + vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 + + ;# second pass + vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] + vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] + + vmulosh v4, v1, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vmulosh v5, v3, v9 + 
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v3 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v2, v3, v8 + vsraw v2, v2, v12 + vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vaddsws v3, v2, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + vspltish v6, 4 + vspltish v7, 3 + + vpkswss v0, v0, v1 + vpkswss v1, v2, v3 + + vaddshs v0, v0, v6 + vaddshs v1, v1, v6 + + vsrah v0, v0, v7 + vsrah v1, v1, v7 + + ;# transpose output + vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 + vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 + + vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 + vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 + + stwu r1,-416(r1) ;# create space on the stack + + stvx v0, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + stvx v1, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 4 +sinpi8sqrt2: + .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 + + .align 4 +cospi8sqrt2minus1: + .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 + + .align 4 +shift_16: + .long 16, 16, 16, 16 + + .align 4 +hi_hi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +lo_lo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c new file mode 100644 index 000000000..586eed477 --- /dev/null +++ b/vp8/common/ppc/loopfilter_altivec.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2010 The VP8 project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef void loop_filter_function_y_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_uv_ppc +( + unsigned char *u, // source pointer + unsigned char *v, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_s_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit +); + +loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; + +loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; + +loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; +loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; + +// Horizontal MB filtering +void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr); + + if (u_ptr) + mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr); +} + +void 
loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Vertical MB Filtering +void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr); + + if (u_ptr) + mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr); +} + +void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Horizontal B Filtering +void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + // These should all be done at once with one call, instead of 3 + loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr); +} + +void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + (void)u_ptr; + (void)v_ptr; + 
(void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); +} + +// Vertical B Filtering +void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr); +} + +void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void)simpler_lpf; + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); +} diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm new file mode 100644 index 000000000..78a5cf9b3 --- /dev/null +++ b/vp8/common/ppc/loopfilter_filters_altivec.asm @@ -0,0 +1,1252 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + .globl mbloop_filter_horizontal_edge_y_ppc + .globl loop_filter_horizontal_edge_y_ppc + .globl mbloop_filter_vertical_edge_y_ppc + .globl loop_filter_vertical_edge_y_ppc + + .globl mbloop_filter_horizontal_edge_uv_ppc + .globl loop_filter_horizontal_edge_uv_ppc + .globl mbloop_filter_vertical_edge_uv_ppc + .globl loop_filter_vertical_edge_uv_ppc + + .globl loop_filter_simple_horizontal_edge_ppc + .globl loop_filter_simple_vertical_edge_ppc + + .text +;# We often need to perform transposes (and other transpose-like operations) +;# on matrices of data. This is simplified by the fact that we usually +;# operate on hunks of data whose dimensions are powers of 2, or at least +;# divisible by highish powers of 2. +;# +;# These operations can be very confusing. They become more straightforward +;# when we think of them as permutations of address bits: Concatenate a +;# group of vector registers and think of it as occupying a block of +;# memory beginning at address zero. The low four bits 0...3 of the +;# address then correspond to position within a register, the higher-order +;# address bits select the register. +;# +;# Although register selection, at the code level, is arbitrary, things +;# are simpler if we use contiguous ranges of register numbers, simpler +;# still if the low-order bits of the register number correspond to +;# conceptual address bits. We do this whenever reasonable. +;# +;# A 16x16 transpose can then be thought of as an operation on +;# a 256-element block of memory. It takes 8 bits 0...7 to address this +;# memory and the effect of a transpose is to interchange address bit +;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the +;# column, which is interchanged with the row addressed by bits 4..7. +;# +;# The altivec merge instructions provide a rapid means of effecting +;# many of these transforms. They operate at three widths (8,16,32). 
+;# Writing V(x) for vector register #x, paired merges permute address +;# indices as follows. +;# +;# 0->1 1->2 2->3 3->(4+d) (4+s)->0: +;# +;# vmrghb V( x), V( y), V( y + (1<<s)) +;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# =0= 1->2 2->3 3->(4+d) (4+s)->1: +;# +;# vmrghh V( x), V( y), V( y + (1<<s)) +;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# =0= =1= 2->3 3->(4+d) (4+s)->2: +;# +;# vmrghw V( x), V( y), V( y + (1<<s)) +;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# Unfortunately, there is no doubleword merge instruction. +;# The following sequence uses "vperm" is a substitute. +;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c) +;# are in registers Vhihi and Vlolo, we can also effect the permutation +;# +;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence: +;# +;# vperm V( x), V( y), V( y + (1<<s)), Vhihi +;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo +;# +;# +;# Except for bits s and d, the other relationships between register +;# number (= high-order part of address) bits are at the disposal of +;# the programmer. +;# + +;# To avoid excess transposes, we filter all 3 vertical luma subblock +;# edges together. This requires a single 16x16 transpose, which, in +;# the above language, amounts to the following permutation of address +;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by +;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0. +;# +;# Except for the fact that the destination registers get written +;# before we are done referencing the old contents, the cyclic transform +;# is effected by +;# +;# x = 0; do { +;# vmrghb V(2x), V(x), V(x+8); +;# vmrghb V(2x+1), V(x), V(x+8); +;# } while( ++x < 8); +;# +;# For clarity, and because we can afford it, we do this transpose +;# using all 32 registers, alternating the banks 0..15 and 16 .. 31, +;# leaving the final result in 16 .. 31, as the lower registers are +;# used in the filtering itself. 
+;# +.macro Tpair A, B, X, Y + vmrghb \A, \X, \Y + vmrglb \B, \X, \Y +.endm + +;# Each step takes 8*2 = 16 instructions + +.macro t16_even + Tpair v16,v17, v0,v8 + Tpair v18,v19, v1,v9 + Tpair v20,v21, v2,v10 + Tpair v22,v23, v3,v11 + Tpair v24,v25, v4,v12 + Tpair v26,v27, v5,v13 + Tpair v28,v29, v6,v14 + Tpair v30,v31, v7,v15 +.endm + +.macro t16_odd + Tpair v0,v1, v16,v24 + Tpair v2,v3, v17,v25 + Tpair v4,v5, v18,v26 + Tpair v6,v7, v19,v27 + Tpair v8,v9, v20,v28 + Tpair v10,v11, v21,v29 + Tpair v12,v13, v22,v30 + Tpair v14,v15, v23,v31 +.endm + +;# Whole transpose takes 4*16 = 64 instructions + +.macro t16_full + t16_odd + t16_even + t16_odd + t16_even +.endm + +;# Vertical edge filtering requires transposes. For the simple filter, +;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels +;# each. Writing 0 ... 63 for the pixel indices, the desired result is: +;# +;# v0 = 0 1 ... 14 15 +;# v1 = 16 17 ... 30 31 +;# v2 = 32 33 ... 47 48 +;# v3 = 49 50 ... 62 63 +;# +;# In frame-buffer memory, the layout is: +;# +;# 0 16 32 48 +;# 1 17 33 49 +;# ... +;# 15 31 47 63. +;# +;# We begin by reading the data 32 bits at a time (using scalar operations) +;# into a temporary array, reading the rows of the array into vector registers, +;# with the following layout: +;# +;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 +;# v1 = 1 17 33 49 5 21 ... 45 61 +;# v2 = 2 18 ... 46 62 +;# v3 = 3 19 ... 47 63 +;# +;# From the "address-bit" perspective discussed above, we simply need to +;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. +;# In other words, we transpose each of the four 4x4 submatrices. +;# +;# This transformation is its own inverse, and we need to perform it +;# again before writing the pixels back into the frame buffer. +;# +;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, +;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors +;# defined above. 
We think of both groups of 4 registers as having +;# "addresses" {0,1,2,3} * 16. +;# +.macro Transpose4times4x4 Vlo, Vhi + + ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= + + vmrghb v4, v0, v1 + vmrglb v5, v0, v1 + vmrghb v6, v2, v3 + vmrglb v7, v2, v3 + + ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 + + vmrghh v0, v4, v6 + vmrglh v1, v4, v6 + vmrghh v2, v5, v7 + vmrglh v3, v5, v7 + + ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= + + vmrghw v4, v0, v1 + vmrglw v5, v0, v1 + vmrghw v6, v2, v3 + vmrglw v7, v2, v3 + + ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 + + vperm v0, v4, v6, \Vlo + vperm v1, v4, v6, \Vhi + vperm v2, v5, v7, \Vlo + vperm v3, v5, v7, \Vhi +.endm +;# end Transpose4times4x4 + + +;# Normal mb vertical edge filter transpose. +;# +;# We read 8 columns of data, initially in the following pattern: +;# +;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) +;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) +;# ... +;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) +;# +;# and wish to convert to: +;# +;# (0,0) ... (0,15) +;# (1,0) ... (1,15) +;# ... +;# (7,0) ... (7,15). +;# +;# In "address bit" language, we wish to map +;# +;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. +;# +;# This can be accomplished by 4 iterations of the cyclic transform +;# +;# I -> (I+1) mod 7; +;# +;# each iteration can be realized by (d=0, s=2): +;# +;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); +;# +;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; +;# preserving v8 = sign converter. 
+;# +;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the +;# result lands in the "mirror" registers v10...v17 +;# +.macro t8x16_odd + Tpair v10, v11, v0, v4 + Tpair v12, v13, v1, v5 + Tpair v14, v15, v2, v6 + Tpair v16, v17, v3, v7 +.endm + +.macro t8x16_even + Tpair v0, v1, v10, v14 + Tpair v2, v3, v11, v15 + Tpair v4, v5, v12, v16 + Tpair v6, v7, v13, v17 +.endm + +.macro transpose8x16_fwd + t8x16_odd + t8x16_even + t8x16_odd + t8x16_even +.endm + +.macro transpose8x16_inv + t8x16_odd + t8x16_even + t8x16_odd +.endm + +.macro Transpose16x16 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 +.endm + +;# load_g 
loads a global vector (whose address is in the local variable Gptr) +;# into vector register Vreg. Trashes r0 +.macro load_g Vreg, Gptr + lwz r0, \Gptr + lvx \Vreg, 0, r0 +.endm + +;# exploit the saturation here. if the answer is negative +;# it will be clamped to 0. orring 0 with a positive +;# number will be the positive number (abs) +;# RES = abs( A-B), trashes TMP +.macro Abs RES, TMP, A, B + vsububs \RES, \A, \B + vsububs \TMP, \B, \A + vor \RES, \RES, \TMP +.endm + +;# RES = Max( RES, abs( A-B)), trashes TMP +.macro max_abs RES, TMP, A, B + vsububs \TMP, \A, \B + vmaxub \RES, \RES, \TMP + vsububs \TMP, \B, \A + vmaxub \RES, \RES, \TMP +.endm + +.macro Masks + ;# build masks + ;# input is all 8 bit unsigned (0-255). need to + ;# do abs(vala-valb) > limit. but no need to compare each + ;# value to the limit. find the max of the absolute differences + ;# and compare that to the limit. + ;# First hev + Abs v14, v13, v2, v3 ;# |P1 - P0| + max_abs v14, v13, v5, v4 ;# |Q1 - Q0| + + vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded + + ;# Next limit + max_abs v14, v13, v0, v1 ;# |P3 - P2| + max_abs v14, v13, v1, v2 ;# |P2 - P1| + max_abs v14, v13, v6, v5 ;# |Q2 - Q1| + max_abs v14, v13, v7, v6 ;# |Q3 - Q2| + + vcmpgtub v9, v14, v9 ;# R = true if limit exceeded + + ;# flimit + Abs v14, v13, v3, v4 ;# |P0 - Q0| + + vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded + + vor v8, v8, v9 ;# R = true if flimit or limit exceeded + ;# done building masks +.endm + +.macro build_constants RFL, RLI, RTH, FL, LI, TH + ;# build constants + lvx \FL, 0, \RFL ;# flimit + lvx \LI, 0, \RLI ;# limit + lvx \TH, 0, \RTH ;# thresh + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 +.endm + +.macro load_data_y + ;# setup strides/pointers to be able to access + ;# all of the data + add r5, r4, r4 ;# r5 = 2 * stride + sub r6, r3, r5 ;# r6 -> 2 rows back + neg r7, r4 ;# r7 = -stride + + ;# load 16 pixels worth of data to work on + sub r0, 
r6, r5 ;# r0 -> 4 rows back (temp) + lvx v0, 0, r0 ;# P3 (read only) + lvx v1, r7, r6 ;# P2 + lvx v2, 0, r6 ;# P1 + lvx v3, r7, r3 ;# P0 + lvx v4, 0, r3 ;# Q0 + lvx v5, r4, r3 ;# Q1 + lvx v6, r5, r3 ;# Q2 + add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) + lvx v7, r4, r0 ;# Q3 (read only) +.endm + +;# Expects +;# v10 == HEV +;# v13 == tmp +;# v14 == tmp +.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT + vxor \P1, \P1, v11 ;# SP1 + vxor \P0, \P0, v11 ;# SP0 + vxor \Q0, \Q0, v11 ;# SQ0 + vxor \Q1, \Q1, v11 ;# SQ1 + + vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) +.if \HEV_PRESENT + vand v13, v13, v10 ;# f &= hev +.endif + vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + + vandc v13, v13, v8 ;# f &= mask + + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v13, v9 ;# f1 = c (f+4) + vaddsbs v15, v13, v8 ;# f2 = c (f+3) + + vsrab v13, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) + vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) +.endm + +.macro vp8_mbfilter + Masks + + ;# start the fitering here + vxor v1, v1, v11 ;# SP2 + vxor v2, v2, v11 ;# SP1 + vxor v3, v3, v11 ;# SP0 + vxor v4, v4, v11 ;# SQ0 + vxor v5, v5, v11 ;# SQ1 + vxor v6, v6, v11 ;# SQ2 + + ;# add outer taps if we have high edge variance + vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) + + vsubsbs v14, v4, v3 ;# SQ0-SP0 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) + + vandc v13, v13, v8 ;# f &= mask + vand v15, v13, v10 ;# f2 = f & hev + + ;# save bottom 3 bits so that we round one side +4 and the other +3 + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v15, v9 ;# f1 = c (f+4) + vaddsbs v15, v15, v8 ;# f2 = c (f+3) + + vsrab v14, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) + vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) + + ;# only apply wider filter if not 
high edge variance + vandc v13, v13, v10 ;# f &= ~hev + + vspltisb v9, 2 + vnor v8, v8, v8 + vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f + vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f + vspltisb v8, 9 + + ;# roughly 1/7th difference across boundary + vspltish v10, 7 + vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v8, v13 + vaddshs v14, v14, v9 ;# += 63 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v6, v6, v10 ;# subtract from Q and add to P + vaddsbs v1, v1, v10 + + vxor v6, v6, v11 + vxor v1, v1, v11 + + ;# roughly 2/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v8, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v5, v5, v10 ;# subtract from Q and add to P + vaddsbs v2, v2, v10 + + vxor v5, v5, v11 + vxor v2, v2, v11 + + ;# roughly 3/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v12, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v4, v4, v10 ;# subtract from Q and add to P + vaddsbs v3, v3, v10 + + vxor v4, v4, v11 + vxor v3, v3, v11 +.endm + +.macro SBFilter + Masks + + common_adjust v3, v4, v2, v5, 1 + + ;# outer tap adjustments + vspltisb v8, 1 + + vaddubm v13, v13, v8 ;# f += 1 + vsrab v13, v13, v8 ;# f >>= 1 + + vandc v13, v13, v10 ;# f &= ~hev + + vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) + vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) + + vxor v2, v2, v11 + 
vxor v3, v3, v11 + vxor v4, v4, v11 + vxor v5, v5, v11 +.endm + + .align 2 +mbloop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + vp8_mbfilter + + stvx v1, r7, r6 ;# P2 + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + stvx v6, r5, r3 ;# Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + SBFilter + + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. +;# So we can read in an entire mb aligned. However if we want to filter the mb +;# edge we run into problems. For the loopfilter we require 4 bytes before the mb +;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit +;# of a waste. So this is an even uglier way to get around that. +;# Using the regular register file words are read in and then saved back out to +;# memory to align and order them up. Then they are read in using the +;# vector register file. 
+.macro RLVmb V, R + lwzux r0, r3, r4 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r3, r4 + stw r0,12(\R) + lwz r0,-4(r3) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + +.macro WLVmb V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r3, r4 + lwz r0, 8(\R) + stw r0,-4(r3) + lwz r0, 4(\R) + stwux r0, r3, r4 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +mbloop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + sub r3, r3, r4 + + RLVmb v0, r9 + RLVmb v1, r9 + RLVmb v2, r9 + RLVmb v3, r9 + RLVmb v4, r9 + RLVmb v5, r9 + RLVmb v6, r9 + RLVmb v7, r9 + + transpose8x16_fwd + + build_constants r5, r6, r7, v8, v9, v10 + + vp8_mbfilter + + transpose8x16_inv + + add r3, r3, r4 + neg r4, r4 + + WLVmb v17, r9 + WLVmb v16, r9 + WLVmb v15, r9 + WLVmb v14, r9 + WLVmb v13, r9 + WLVmb v12, r9 + WLVmb v11, r9 + WLVmb v10, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RL V, R, P + lvx \V, 0, \R + add \R, \R, \P +.endm + +.macro WL V, R, P + stvx \V, 0, \R + add \R, \R, \P +.endm + +.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 + ;# K = |P0-P1| already + Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| + vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) + vcmpgtub v10, v14, v0 + + Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] + + max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) + max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) + max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) + + vmaxub v14, v14, v4 ;# M = max interior abs diff + vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded + + Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) + vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded + vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded + + ;# replace P1,Q1 w/signed versions + 
common_adjust \P0, \Q0, \P1, \Q1, 1 + + vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant + vsrab v13, v13, v1 + vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev + vsubsbs \Q1, \Q1, v13 + vaddsbs \P1, \P1, v13 + + vxor \P1, \P1, v11 ;# P1 + vxor \P0, \P0, v11 ;# P0 + vxor \Q0, \Q0, v11 ;# Q0 + vxor \Q1, \Q1, v11 ;# Q1 +.endm + + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + addi r9, r3, 0 + RL v16, r9, r4 + RL v17, r9, r4 + RL v18, r9, r4 + RL v19, r9, r4 + RL v20, r9, r4 + RL v21, r9, r4 + RL v22, r9, r4 + RL v23, r9, r4 + RL v24, r9, r4 + RL v25, r9, r4 + RL v26, r9, r4 + RL v27, r9, r4 + RL v28, r9, r4 + RL v29, r9, r4 + RL v30, r9, r4 + lvx v31, 0, r9 + + Transpose16x16 + + vspltisb v1, 1 + + build_constants r5, r6, r7, v3, v2, v0 + + Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| + + Fil v16, v17, v18, v19, v20, v21, v22, v23 + Fil v20, v21, v22, v23, v24, v25, v26, v27 + Fil v24, v25, v26, v27, v28, v29, v30, v31 + + Transpose16x16 + + addi r9, r3, 0 + WL v16, r9, r4 + WL v17, r9, r4 + WL v18, r9, r4 + WL v19, r9, r4 + WL v20, r9, r4 + WL v21, r9, r4 + WL v22, r9, r4 + WL v23, r9, r4 + WL v24, r9, r4 + WL v25, r9, r4 + WL v26, r9, r4 + WL v27, r9, r4 + WL v28, r9, r4 + WL v29, r9, r4 + WL v30, r9, r4 + stvx v31, 0, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +.macro active_chroma_sel V + andi. 
r7, r3, 8 ;# row origin modulo 16 + add r7, r7, r7 ;# selects selectors + lis r12, _chromaSelectors@ha + la r0, _chromaSelectors@l(r12) + lwzux r0, r7, r0 ;# leave selector addr in r7 + + lvx \V, 0, r0 ;# mask to concatenate active U,V pels +.endm + +.macro hread_uv Dest, U, V, Offs, VMask + lvx \U, \Offs, r3 + lvx \V, \Offs, r4 + vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V +.endm + +.macro hwrite_uv New, U, V, Offs, Umask, Vmask + vperm \U, \New, \U, \Umask ;# Combine new pels with siblings + vperm \V, \New, \V, \Vmask + stvx \U, \Offs, r3 ;# Write to frame buffer + stvx \V, \Offs, r4 +.endm + +;# Process U,V in parallel. +.macro load_chroma_h + neg r9, r5 ;# r9 = -1 * stride + add r8, r9, r9 ;# r8 = -2 * stride + add r10, r5, r5 ;# r10 = 2 * stride + + active_chroma_sel v12 + + ;# P3, Q3 are read-only; need not save addresses or sibling pels + add r6, r8, r8 ;# r6 = -4 * stride + hread_uv v0, v14, v15, r6, v12 + add r6, r10, r5 ;# r6 = 3 * stride + hread_uv v7, v14, v15, r6, v12 + + ;# Others are read/write; save addresses and sibling pels + + add r6, r8, r9 ;# r6 = -3 * stride + hread_uv v1, v16, v17, r6, v12 + hread_uv v2, v18, v19, r8, v12 + hread_uv v3, v20, v21, r9, v12 + hread_uv v4, v22, v23, 0, v12 + hread_uv v5, v24, v25, r5, v12 + hread_uv v6, v26, v27, r10, v12 +.endm + +.macro uresult_sel V + load_g \V, 4(r7) +.endm + +.macro vresult_sel V + load_g \V, 8(r7) +.endm + +;# always write P1,P0,Q0,Q1 +.macro store_chroma_h + uresult_sel v11 + vresult_sel v12 + hwrite_uv v2, v18, v19, r8, v11, v12 + hwrite_uv v3, v20, v21, r9, v11, v12 + hwrite_uv v4, v22, v23, 0, v11, v12 + hwrite_uv v5, v24, v25, r5, v11, v12 +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + 
build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + vp8_mbfilter + + store_chroma_h + + hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 + hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +loop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + SBFilter + + store_chroma_h + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro R V, R + lwzux r0, r3, r5 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r4, r5 + stw r0,12(\R) + lwz r0,-4(r4) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + + +.macro W V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r4, r5 + lwz r0, 8(\R) + stw r0,-4(r4) + lwz r0, 4(\R) + stwux r0, r3, r5 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + +.macro chroma_vread R + sub r3, r3, r5 ;# back up one line for simplicity + sub r4, r4, r5 + + R v0, \R + R v1, \R + R v2, \R + R v3, \R + R v4, \R + R v5, \R + R v6, \R + R v7, \R + + transpose8x16_fwd +.endm + +.macro chroma_vwrite R + + transpose8x16_inv + + add r3, r3, r5 + add r4, r4, r5 + neg r5, r5 ;# Write rows back in reverse order + + W v17, \R + W v16, \R + W v15, \R + W v14, \R + W v13, \R + W v12, \R + W v11, \R + W v10, \R +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + vp8_mbfilter + + chroma_vwrite r9 + + 
mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +loop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + SBFilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- + +.macro vp8_simple_filter + Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) + vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit + + ;# preserve unsigned v0 and v3 + common_adjust v1, v2, v0, v3, 0 + + vxor v1, v1, v11 + vxor v2, v2, v11 ;# cvt Q0, P0 back to pels +.endm + +.macro simple_vertical + addi r8, 0, 16 + addi r7, r5, 32 + + lvx v0, 0, r5 + lvx v1, r8, r5 + lvx v2, 0, r7 + lvx v3, r8, r7 + + lis r12, _B_hihi@ha + la r0, _B_hihi@l(r12) + lvx v16, 0, r0 + + lis r12, _B_lolo@ha + la r0, _B_lolo@l(r12) + lvx v17, 0, r0 + + Transpose4times4x4 v16, v17 + vp8_simple_filter + + vxor v0, v0, v11 + vxor v3, v3, v11 ;# cvt Q0, P0 back to pels + + Transpose4times4x4 v16, v17 + + stvx v0, 0, r5 + stvx v1, r8, r5 + stvx v2, 0, r7 + stvx v3, r8, r7 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_horizontal_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + neg r5, r4 ;# r5 = -1 * stride + add r6, r5, r5 ;# r6 = -2 * stride + + lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge + lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge + lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one 
row below edge + lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge + + vp8_simple_filter + + stvx v1, r5, r3 ;# store P0 + stvx v2, 0, r3 ;# store Q0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RLV Offs + stw r0, (\Offs*4)(r5) + lwzux r0, r7, r4 +.endm + +.macro WLV Offs + lwz r0, (\Offs*4)(r5) + stwux r0, r7, r4 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_vertical_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + la r5, -96(r1) ;# temporary space for reading in vectors + + ;# Store 4 pels at word "Offs" in temp array, then advance r7 + ;# to next row and read another 4 pels from the frame buffer. + + subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r7 ;# read first 4 pels + + ;# 16 unaligned word accesses + RLV 0 + RLV 4 + RLV 8 + RLV 12 + RLV 1 + RLV 5 + RLV 9 + RLV 13 + RLV 2 + RLV 6 + RLV 10 + RLV 14 + RLV 3 + RLV 7 + RLV 11 + + stw r0, (15*4)(r5) ;# write last 4 pels + + simple_vertical + + ;# Read temp array, write frame buffer. 
+ subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r5 ;# read/write first 4 pels + stwx r0, 0, r7 + + WLV 4 + WLV 8 + WLV 12 + WLV 1 + WLV 5 + WLV 9 + WLV 13 + WLV 2 + WLV 6 + WLV 10 + WLV 14 + WLV 3 + WLV 7 + WLV 11 + WLV 15 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + +_chromaSelectors: + .long _B_hihi + .long _B_Ures0 + .long _B_Vres0 + .long 0 + .long _B_lolo + .long _B_Ures8 + .long _B_Vres8 + .long 0 + + .align 4 +_B_Vres8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 + + .align 4 +_B_Ures8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 + + .align 4 +_B_lolo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_Vres0: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + .align 4 +_B_Ures0: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_hihi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm new file mode 100644 index 000000000..227ef2a94 --- /dev/null +++ b/vp8/common/ppc/platform_altivec.asm @@ -0,0 +1,58 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+;
+
+
+    .globl save_platform_context
+    .globl restore_platform_context
+
+;# Store vector register V at address P, then advance P by 16 bytes.
+.macro W V P
+    stvx    \V, 0, \P
+    addi    \P, \P, 16
+.endm
+
+;# Load vector register V from address P, then advance P by 16 bytes.
+.macro R V P
+    lvx     \V, 0, \P
+    addi    \P, \P, 16
+.endm
+
+;# r3 context_ptr
+;# Save vector registers v20..v31 (12 x 16 bytes) to the buffer at r3.
+;# NOTE: label previously misspelled "save_platform_contex", which left
+;# the .globl symbol save_platform_context undefined; fixed here.
+    .align 2
+save_platform_context:
+    W v20, r3
+    W v21, r3
+    W v22, r3
+    W v23, r3
+    W v24, r3
+    W v25, r3
+    W v26, r3
+    W v27, r3
+    W v28, r3
+    W v29, r3
+    W v30, r3
+    W v31, r3
+
+    blr
+
+;# r3 context_ptr
+;# Restore vector registers v20..v31 from the buffer at r3.
+    .align 2
+restore_platform_context:
+    R v20, r3
+    R v21, r3
+    R v22, r3
+    R v23, r3
+    R v24, r3
+    R v25, r3
+    R v26, r3
+    R v27, r3
+    R v28, r3
+    R v29, r3
+    R v30, r3
+    R v31, r3
+
+    blr
diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm
new file mode 100644
index 000000000..f478b954c
--- /dev/null
+++ b/vp8/common/ppc/recon_altivec.asm
@@ -0,0 +1,174 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree. 
+; + + + .globl recon4b_ppc + .globl recon2b_ppc + .globl recon_b_ppc + +.macro row_of16 Diff Pred Dst Stride + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + addi \Pred, \Pred, 16 ;# next pred + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v3 = d8..d15 + addi \Diff, \Diff, 32 ;# next diff + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 + stvx v2, 0, \Dst ;# to dst + add \Dst, \Dst, \Stride ;# next dst +.endm + + .text + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon4b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v2 = d8..d15 + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 + stvx v2, 0, r10 ;# 2 rows to dst from buf + lwz r0, 0(r10) +.if \write_first_four_pels + stw r0, 0(\Dst) + .else + stwux r0, \Dst, \Stride +.endif + lwz r0, 4(r10) + stw r0, 4(\Dst) + lwz r0, 8(r10) + stwux r0, \Dst, \Stride ;# advance dst to next row + lwz r0, 12(r10) + stw r0, 4(\Dst) +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride + +recon2b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 
0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + la r10, -48(r1) ;# buf + + two_rows_of8 r3, r4, r5, r6, 1 + + addi r4, r4, 16; ;# next pred + addi r3, r3, 32; ;# next diff + + two_rows_of8 r3, r4, r5, r6, 0 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro get_two_diff_rows + stw r0, 0(r10) + lwz r0, 4(r3) + stw r0, 4(r10) + lwzu r0, 32(r3) + stw r0, 8(r10) + lwz r0, 4(r3) + stw r0, 12(r10) + lvx v3, 0, r10 +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon_b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + + la r10, -48(r1) ;# buf + + lwz r0, 0(r4) + stw r0, 0(r10) + lwz r0, 16(r4) + stw r0, 4(r10) + lwz r0, 32(r4) + stw r0, 8(r10) + lwz r0, 48(r4) + stw r0, 12(r10) + + lvx v1, 0, r10; ;# v1 = pred = p0..p15 + + lwz r0, 0(r3) ;# v3 = d0..d7 + + get_two_diff_rows + + vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 + vaddshs v2, v2, v3; ;# v2 = r0..r7 + + lwzu r0, 32(r3) ;# v3 = d8..d15 + + get_two_diff_rows + + vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 + vaddshs v3, v3, v1; ;# v3 = r8..r15 + + vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 + stvx v2, 0, r10; ;# 16 pels to dst from buf + + lwz r0, 0(r10) + stw r0, 0(r5) + lwz r0, 4(r10) + stwux r0, r5, r6 + lwz r0, 8(r10) + stwux r0, r5, r6 + lwz r0, 12(r10) + stwx r0, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c new file mode 100644 index 000000000..284731085 --- /dev/null +++ b/vp8/common/ppc/systemdependent.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "g_common.h" +#include "subpixel.h" +#include "loopfilter.h" +#include "recon.h" +#include "idct.h" +#include "onyxc_int.h" + +void (*vp8_short_idct4x4)(short *input, short *output, int pitch); +void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); +void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); + +extern void (*vp8_post_proc_down_and_across)( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +); + +extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); +extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); + +extern void vp8_post_proc_down_and_across_c +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +); +void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); + +extern copy_mem_block_function *vp8_copy_mem16x16; +extern copy_mem_block_function *vp8_copy_mem8x8; +extern copy_mem_block_function *vp8_copy_mem8x4; + +// PPC +extern subpixel_predict_function sixtap_predict_ppc; +extern subpixel_predict_function sixtap_predict8x4_ppc; +extern subpixel_predict_function sixtap_predict8x8_ppc; +extern subpixel_predict_function sixtap_predict16x16_ppc; +extern subpixel_predict_function bilinear_predict4x4_ppc; +extern 
subpixel_predict_function bilinear_predict8x4_ppc; +extern subpixel_predict_function bilinear_predict8x8_ppc; +extern subpixel_predict_function bilinear_predict16x16_ppc; + +extern copy_mem_block_function copy_mem16x16_ppc; + +void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); + +// Generic C +extern subpixel_predict_function vp8_sixtap_predict_c; +extern subpixel_predict_function vp8_sixtap_predict8x4_c; +extern subpixel_predict_function vp8_sixtap_predict8x8_c; +extern subpixel_predict_function vp8_sixtap_predict16x16_c; +extern subpixel_predict_function vp8_bilinear_predict4x4_c; +extern subpixel_predict_function vp8_bilinear_predict8x4_c; +extern subpixel_predict_function vp8_bilinear_predict8x8_c; +extern subpixel_predict_function vp8_bilinear_predict16x16_c; + +extern copy_mem_block_function vp8_copy_mem16x16_c; +extern copy_mem_block_function vp8_copy_mem8x8_c; +extern copy_mem_block_function vp8_copy_mem8x4_c; + +void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); +extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch); +extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); + +// PPC +extern loop_filter_block_function loop_filter_mbv_ppc; +extern loop_filter_block_function loop_filter_bv_ppc; +extern loop_filter_block_function loop_filter_mbh_ppc; +extern loop_filter_block_function 
loop_filter_bh_ppc; + +extern loop_filter_block_function loop_filter_mbvs_ppc; +extern loop_filter_block_function loop_filter_bvs_ppc; +extern loop_filter_block_function loop_filter_mbhs_ppc; +extern loop_filter_block_function loop_filter_bhs_ppc; + +// Generic C +extern loop_filter_block_function vp8_loop_filter_mbv_c; +extern loop_filter_block_function vp8_loop_filter_bv_c; +extern loop_filter_block_function vp8_loop_filter_mbh_c; +extern loop_filter_block_function vp8_loop_filter_bh_c; + +extern loop_filter_block_function vp8_loop_filter_mbvs_c; +extern loop_filter_block_function vp8_loop_filter_bvs_c; +extern loop_filter_block_function vp8_loop_filter_mbhs_c; +extern loop_filter_block_function vp8_loop_filter_bhs_c; + +extern loop_filter_block_function *vp8_lf_mbvfull; +extern loop_filter_block_function *vp8_lf_mbhfull; +extern loop_filter_block_function *vp8_lf_bvfull; +extern loop_filter_block_function *vp8_lf_bhfull; + +extern loop_filter_block_function *vp8_lf_mbvsimple; +extern loop_filter_block_function *vp8_lf_mbhsimple; +extern loop_filter_block_function *vp8_lf_bvsimple; +extern loop_filter_block_function *vp8_lf_bhsimple; + +void vp8_clear_c(void) +{ +} + +void vp8_machine_specific_config(void) +{ + // Pure C: + vp8_clear_system_state = vp8_clear_c; + vp8_recon_b = vp8_recon_b_c; + vp8_recon4b = vp8_recon4b_c; + vp8_recon2b = vp8_recon2b_c; + + vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc; + vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc; + vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc; + vp8_bilinear_predict = bilinear_predict4x4_ppc; + + vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc; + vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc; + vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc; + vp8_sixtap_predict = sixtap_predict_ppc; + + vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c; + vp8_short_idct4x4 = short_idct4x4llm_ppc; + vp8_dc_only_idct = vp8_dc_only_idct_c; + + vp8_lf_mbvfull = loop_filter_mbv_ppc; + vp8_lf_bvfull = 
loop_filter_bv_ppc; + vp8_lf_mbhfull = loop_filter_mbh_ppc; + vp8_lf_bhfull = loop_filter_bh_ppc; + + vp8_lf_mbvsimple = loop_filter_mbvs_ppc; + vp8_lf_bvsimple = loop_filter_bvs_ppc; + vp8_lf_mbhsimple = loop_filter_mbhs_ppc; + vp8_lf_bhsimple = loop_filter_bhs_ppc; + + vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c; + vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; + vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; + vp8_plane_add_noise = vp8_plane_add_noise_c; + + vp8_copy_mem16x16 = copy_mem16x16_ppc; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + +} diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h new file mode 100644 index 000000000..c66397682 --- /dev/null +++ b/vp8/common/ppflags.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_PPFLAGS_H +#define __INC_PPFLAGS_H +enum +{ + VP8D_NOFILTERING = 0, + VP8D_DEBLOCK = 1, + VP8D_DEMACROBLOCK = 2, + VP8D_ADDNOISE = 4, + VP8D_DEBUG_LEVEL1 = 8, + VP8D_DEBUG_LEVEL2 = 16, + VP8D_DEBUG_LEVEL3 = 32, + VP8D_DEBUG_LEVEL4 = 64, +}; + +#endif diff --git a/vp8/common/pragmas.h b/vp8/common/pragmas.h new file mode 100644 index 000000000..25a4b776f --- /dev/null +++ b/vp8/common/pragmas.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */
+
+
+
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
diff --git a/vp8/common/predictdc.c b/vp8/common/predictdc.c
new file mode 100644
index 000000000..df4c96e4a
--- /dev/null
+++ b/vp8/common/predictdc.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "blockd.h"
+
+
+/* Predict/reconstruct the current block's DC coefficient from the previous
+ * block's DC.  *cons counts a run of consecutive "similar" DC values; once
+ * the run exceeds DCPREDCNTTHRESH the previous DC is added to the incoming
+ * value (the incoming DC is presumably coded as a difference -- TODO
+ * confirm against the encoder side).  The thresholds DCPREDCNTTHRESH and
+ * DCPREDSIMTHRESH are defined elsewhere (blockd.h, not visible here).
+ * On return both *thisdc and *lastdc hold the final DC value.
+ */
+void vp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons)
+{
+    int diff;
+    int sign;
+    int last_dc = *lastdc;   /* widen to int for the arithmetic below */
+    int this_dc = *thisdc;
+
+    /* Enough similar DCs seen: reconstruct by adding the previous DC. */
+    if (*cons > DCPREDCNTTHRESH)
+    {
+        this_dc += last_dc;
+    }
+
+    diff = abs(last_dc - this_dc);
+    /* Nonzero when the two DCs have opposite sign bits.  NOTE(review):
+     * >> on a negative int is implementation-defined in C; an arithmetic
+     * shift is assumed here. */
+    sign = (last_dc >> 31) ^(this_dc >> 31);
+    /* A zero DC on either side also counts as a mismatch. */
+    sign |= (!last_dc | !this_dc);
+
+    if (sign)
+    {
+        /* Dissimilar: restart the similarity run. */
+        *cons = 0;
+    }
+    else
+    {
+        /* Similar within DCPREDSIMTHRESH quantizer steps: extend the run. */
+        if (diff <= DCPREDSIMTHRESH * quant)
+            (*cons)++ ;
+    }
+
+    /* Store the (possibly reconstructed) DC and remember it for the next
+     * block. */
+    *thisdc = this_dc;
+    *lastdc = this_dc;
+}
diff --git a/vp8/common/predictdc.h b/vp8/common/predictdc.h
new file mode 100644
index 000000000..b8871e452
--- /dev/null
+++ b/vp8/common/predictdc.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree. 
+ */ + + +#ifndef __PREDICTDC_H +#define __PREDICTDC_H + +void uvvp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons); +void vp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons); + +#endif diff --git a/vp8/common/preproc.h b/vp8/common/preproc.h new file mode 100644 index 000000000..00ec9a8d7 --- /dev/null +++ b/vp8/common/preproc.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : preproc.h +* +* Description : simple preprocessor +* +****************************************************************************/ + +#ifndef __INC_PREPROC_H +#define __INC_PREPROC_H + +/**************************************************************************** +* Types +****************************************************************************/ + +typedef struct +{ + unsigned char *frame_buffer; + int frame; + unsigned int *fixed_divide; + + unsigned char *frame_buffer_alloc; + unsigned int *fixed_divide_alloc; +} pre_proc_instance; + +/**************************************************************************** +* Functions. 
+****************************************************************************/ +void pre_proc_machine_specific_config(void); +void delete_pre_proc(pre_proc_instance *ppi); +int init_pre_proc(pre_proc_instance *ppi, int frame_size); +extern void spatial_filter_c(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int width, int height, int pitch, int strength); +extern void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); + +#endif diff --git a/vp8/common/preprocif.h b/vp8/common/preprocif.h new file mode 100644 index 000000000..986c45b10 --- /dev/null +++ b/vp8/common/preprocif.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : preproc_if.h +* +* Description : Pre-processor interface header file. 
+* +****************************************************************************/ + +#ifndef __PREPROC_IF_H +#define __PREPROC_IF_H + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "type_aliases.h" + +/**************************************************************************** +* Types +****************************************************************************/ + +typedef struct +{ + UINT8 *Yuv0ptr; + UINT8 *Yuv1ptr; + + UINT8 *frag_info; // blocks coded : passed in + UINT32 frag_info_element_size; // size of each element + UINT32 frag_info_coded_mask; // mask to get at whether fragment is coded + + UINT32 *region_index; // Gives pixel index for top left of each block + UINT32 video_frame_height; + UINT32 video_frame_width; + UINT8 hfrag_pixels; + UINT8 vfrag_pixels; + +} SCAN_CONFIG_DATA; + +typedef enum +{ + SCP_FILTER_ON_OFF, + SCP_SET_SRF_OFFSET, + SCP_SET_EBO_ON_OFF, + SCP_SET_VCAP_LEVEL_OFFSET, + SCP_SET_SHOW_LOCAL + +} SCP_SETTINGS; + +typedef struct PP_INSTANCE *x_pp_inst; + +/**************************************************************************** +* Module statics +****************************************************************************/ +/* Controls whether Early break out is on or off in default case */ +#define EARLY_BREAKOUT_DEFAULT TRUE + +/**************************************************************************** +* Functions +****************************************************************************/ +extern void set_scan_param(x_pp_inst ppi, UINT32 param_id, INT32 param_value); +extern UINT32 yuvanalyse_frame(x_pp_inst ppi, UINT32 *KFIndicator); +extern x_pp_inst create_pp_instance(void); +extern void delete_pp_instance(x_pp_inst *); +extern BOOL scan_yuvinit(x_pp_inst, SCAN_CONFIG_DATA *scan_config_ptr); + +#endif diff --git a/vp8/common/proposed.h b/vp8/common/proposed.h new file mode 100644 
index 000000000..1171ede43 --- /dev/null +++ b/vp8/common/proposed.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +typedef struct core_codec *codec_ptr; +typedef struct interface_table *interface_ptr; + +typedef struct +{ + void (*Initialize)(); + void (*Shutdown)(); + codec_ptr(*Create)(); + int (*compress_frame)(codec_ptr, unsigned int *frame_flags, YV12_BUFFER_CONFIG *sd, unsigned long *size, char *dest, INT64 time_stamp); + int (*show_frame)(codec_ptr , YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags); + void (*Remove)(codec_ptr *comp); + interface_ptr(*get_interface)(unsigned int id); + +} core_codec; + +typedef struct +{ + int (*set_bitrate)(codec_ptr, END_USAGE usage, int Datarate); + int (*get_bitrate)(codec_ptr, END_USAGE *usage, int *Datarate); + int (*set_mode)(codec_ptr, MODE mode, int Speed, char *File); + int (*get_mode)(codec_ptr, MODE *mode, int *Speed, char **File); +} codec_settings_basic; + +typedef struct +{ + int (*set_bitrate)(codec_ptr, END_USAGE usage, int Datarate); + int (*get_bitrate)(codec_ptr, END_USAGE *usage, int *Datarate); + int (*set_mode)(codec_ptr, MODE mode, int Speed, char *File); + int (*get_mode)(codec_ptr, MODE *mode, int *Speed, char **File); + int (*set_denoise)(codec_ptr, int Level); + int (*get_denoise)(codec_ptr, int *Level); + int (*set_sharpness)(codec_ptr, int sharpness); + int (*get_sharpness)(codec_ptr, int *sharpness); + int (*set_keyframing)(codec_ptr, int Auto, int max_distance); + int (*get_keyframing)(codec_ptr, int *Auto, int *max_distance); + int (*set_buffering)(codec_ptr, int buffer_level, int max_buffer_level); + int (*get_buffering)(codec_ptr, int *buffer_level, int 
*max_buffer_level); + int (*set_adjust_frame_rate)(codec_ptr, int Allowed, int at_buffer_level_pct); + int (*get_adjust_frame_rate)(codec_ptr, int *Allowed, int *at_buffer_level_pct); + int (*set_adjust_frame_size)(codec_ptr, int Allowed, int down_at_buffer_level_pct, int up_at_buffer_level_pct); + int (*get_adjust_frame_size)(codec_ptr, int *Allowed, int *down_at_buffer_level_pct, int *up_at_buffer_level_pct); + int (*set_adjust_quality)(codec_ptr, int Allowed, int min_quantizer, int max_quantizer); + int (*get_adjust_quality)(codec_ptr, int *Allowed, int *min_quantizer, int *max_quantizer); + int (*set_vbrparms)(codec_ptr, int Bias, int Min, int Max); + int (*get_vbrparms)(codec_ptr, int *Bias, int *Min, int *Max); + +} codec_settings_v1; + +typedef struct +{ + int (*request_recovery)(codec_ptr); + int (*request_droppable)(codec_ptr); + int (*internal_size)(codec_ptr, VPX_SCALING Vertical, VPX_SCALING Horizontal); + int (*update_last)(codec_ptr); + int (*update_gold)(codec_ptr); + int (*use_only_last)(codec_ptr); + int (*use_only_gold)(codec_ptr); + int (*update_entropy)(codec_ptr); + +} codec_realtime_requests; diff --git a/vp8/common/quant_common.c b/vp8/common/quant_common.c new file mode 100644 index 000000000..09fe31fe5 --- /dev/null +++ b/vp8/common/quant_common.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "quant_common.h" + +static const int dc_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, + 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157, +}; + +static const int ac_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152, + 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, + 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, +}; + + +int vp8_dc_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ]; + return retval; +} + +int vp8_dc2quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ] * 2; + return retval; + +} +int vp8_dc_uv_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ]; + + if (retval > 132) + retval = 132; + + return retval; +} + +int vp8_ac_yquant(int 
QIndex) +{ + int retval; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = ac_qlookup[ QIndex ]; + return retval; +} + +int vp8_ac2quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = (ac_qlookup[ QIndex ] * 155) / 100; + + if (retval < 8) + retval = 8; + + return retval; +} +int vp8_ac_uv_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = ac_qlookup[ QIndex ]; + return retval; +} diff --git a/vp8/common/quant_common.h b/vp8/common/quant_common.h new file mode 100644 index 000000000..0c92ce8b9 --- /dev/null +++ b/vp8/common/quant_common.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "string.h" +#include "blockd.h" +#include "onyxc_int.h" + +extern int vp8_ac_yquant(int QIndex); +extern int vp8_dc_quant(int QIndex, int Delta); +extern int vp8_dc2quant(int QIndex, int Delta); +extern int vp8_ac2quant(int QIndex, int Delta); +extern int vp8_dc_uv_quant(int QIndex, int Delta); +extern int vp8_ac_uv_quant(int QIndex, int Delta); diff --git a/vp8/common/recon.c b/vp8/common/recon.c new file mode 100644 index 000000000..d1268ea22 --- /dev/null +++ b/vp8/common/recon.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS
 + * file in the root of the source tree.
 + */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "blockd.h"
+
+/* Reconstruct one 4x4 block: dst = clamp(pred + diff, 0, 255).
+ * diff and pred advance 16 entries per row (the block lives inside a
+ * 16-wide macroblock buffer); dst advances by the frame stride. */
+void vp8_recon_b_c
+(
+    unsigned char *pred_ptr,
+    short *diff_ptr,
+    unsigned char *dst_ptr,
+    int stride
+)
+{
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred_ptr[c] ;
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a ;
+        }
+
+        dst_ptr += stride;
+        diff_ptr += 16;
+        pred_ptr += 16;
+    }
+}
+
+/* Reconstruct a row of four 4x4 blocks (4x16 pels) in one call;
+ * same clamp(pred + diff, 0, 255) operation as vp8_recon_b_c. */
+void vp8_recon4b_c
+(
+    unsigned char *pred_ptr,
+    short *diff_ptr,
+    unsigned char *dst_ptr,
+    int stride
+)
+{
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 16; c++)
+        {
+            int a = diff_ptr[c] + pred_ptr[c] ;
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a ;
+        }
+
+        dst_ptr += stride;
+        diff_ptr += 16;
+        pred_ptr += 16;
+    }
+}
+
+/* Reconstruct a pair of 4x4 blocks (4x8 pels); diff/pred rows are 8
+ * entries apart here (used on blocks 16..23 below -- presumably the
+ * 8-wide chroma buffers; confirm against the block layout). */
+void vp8_recon2b_c
+(
+    unsigned char *pred_ptr,
+    short *diff_ptr,
+    unsigned char *dst_ptr,
+    int stride
+)
+{
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 8; c++)
+        {
+            int a = diff_ptr[c] + pred_ptr[c] ;
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a ;
+        }
+
+        dst_ptr += stride;
+        diff_ptr += 8;
+        pred_ptr += 8;
+    }
+}
+
+/* Reconstruct the 16x16 luma part of a macroblock: one recon4 call per
+ * row of four blocks (blocks 0, 4, 8, 12), dispatched via the RTCD
+ * vtable. */
+void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 4)
+    {
+        BLOCKD *b = &x->block[i];
+
+        RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    }
+}
+
+/* Reconstruct a whole macroblock: luma blocks 0..15 via recon4 (one call
+ * per row of four), then blocks 16..23 via recon2 (one call per pair). */
+void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 4)
+    {
+        BLOCKD *b = &x->block[i];
+
+        RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    }
+
+    for (i = 16; i < 24; i += 2)
+    {
+        BLOCKD *b = &x->block[i];
+
+        RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, 
b->dst_stride); + } +} diff --git a/vp8/common/recon.h b/vp8/common/recon.h new file mode 100644 index 000000000..f65a90f7e --- /dev/null +++ b/vp8/common/recon.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_RECON_H +#define __INC_RECON_H + +#define prototype_copy_block(sym) \ + void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch) + +#define prototype_recon_block(sym) \ + void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch); + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/recon_x86.h" +#endif + +#if ARCH_ARM +#include "arm/recon_arm.h" +#endif + +#ifndef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_c +#endif +extern prototype_copy_block(vp8_recon_copy16x16); + +#ifndef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp8_copy_mem8x8_c +#endif +extern prototype_copy_block(vp8_recon_copy8x8); + +#ifndef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp8_copy_mem8x4_c +#endif +extern prototype_copy_block(vp8_recon_copy8x4); + +#ifndef vp8_recon_recon +#define vp8_recon_recon vp8_recon_b_c +#endif +extern prototype_recon_block(vp8_recon_recon); + +#ifndef vp8_recon_recon2 +#define vp8_recon_recon2 vp8_recon2b_c +#endif +extern prototype_recon_block(vp8_recon_recon2); + +#ifndef vp8_recon_recon4 +#define vp8_recon_recon4 vp8_recon4b_c +#endif +extern prototype_recon_block(vp8_recon_recon4); + +typedef prototype_copy_block((*vp8_copy_block_fn_t)); +typedef prototype_recon_block((*vp8_recon_fn_t)); +typedef struct +{ + vp8_copy_block_fn_t copy16x16; + vp8_copy_block_fn_t copy8x8; + vp8_copy_block_fn_t copy8x4; + vp8_recon_fn_t recon; + vp8_recon_fn_t recon2; + vp8_recon_fn_t recon4; +} 
vp8_recon_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define RECON_INVOKE(ctx,fn) (ctx)->fn +#else +#define RECON_INVOKE(ctx,fn) vp8_recon_##fn +#endif + +#include "blockd.h" +void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +#endif diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c new file mode 100644 index 000000000..c48886deb --- /dev/null +++ b/vp8/common/reconinter.c @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "subpixel.h" +#include "blockd.h" +#include "reconinter.h" +#if CONFIG_RUNTIME_CPU_DETECT +#include "onyxc_int.h" +#endif + +// use this define on systems where unaligned int reads and writes are +// not allowed, i.e. 
ARM architectures +//#define MUST_BE_ALIGNED + + +static const int bbb[4] = {0, 2, 8, 10}; + + + +void vp8_copy_mem16x16_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + + int r; + + for (r = 0; r < 16; r++) + { +#ifdef MUST_BE_ALIGNED + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; + dst[8] = src[8]; + dst[9] = src[9]; + dst[10] = src[10]; + dst[11] = src[11]; + dst[12] = src[12]; + dst[13] = src[13]; + dst[14] = src[14]; + dst[15] = src[15]; + +#else + ((int *)dst)[0] = ((int *)src)[0] ; + ((int *)dst)[1] = ((int *)src)[1] ; + ((int *)dst)[2] = ((int *)src)[2] ; + ((int *)dst)[3] = ((int *)src)[3] ; + +#endif + src += src_stride; + dst += dst_stride; + + } + +} + +void vp8_copy_mem8x8_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 8; r++) + { +#ifdef MUST_BE_ALIGNED + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; +#else + ((int *)dst)[0] = ((int *)src)[0] ; + ((int *)dst)[1] = ((int *)src)[1] ; +#endif + src += src_stride; + dst += dst_stride; + + } + +} + +void vp8_copy_mem8x4_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 4; r++) + { +#ifdef MUST_BE_ALIGNED + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; +#else + ((int *)dst)[0] = ((int *)src)[0] ; + ((int *)dst)[1] = ((int *)src)[1] ; +#endif + src += src_stride; + dst += dst_stride; + + } + +} + + + +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + + ptr_base = *(d->base_pre); + + if 
(d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + } + else + { + ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + ptr = ptr_base; + + for (r = 0; r < 4; r++) + { +#ifdef MUST_BE_ALIGNED + pred_ptr[0] = ptr[0]; + pred_ptr[1] = ptr[1]; + pred_ptr[2] = ptr[2]; + pred_ptr[3] = ptr[3]; +#else + *(int *)pred_ptr = *(int *)ptr ; +#endif + pred_ptr += pitch; + ptr += d->pre_stride; + } + } +} + +void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch) +{ + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + + ptr_base = *(d->base_pre); + ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, pred_ptr, pitch); + } +} + +void vp8_build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch) +{ + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + + ptr_base = *(d->base_pre); + ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, pred_ptr, pitch); + } +} + + +void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x) +{ + int i; + + if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV) + { + unsigned char *uptr, *vptr; + unsigned char *upred_ptr = 
&x->predictor[256]; + unsigned char *vpred_ptr = &x->predictor[320]; + + int mv_row = x->block[16].bmi.mv.as_mv.row; + int mv_col = x->block[16].bmi.mv.as_mv.col; + int offset; + int pre_stride = x->block[16].pre_stride; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8); + RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8); + } + } + else + { + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b(x, d0, 8); + else + { + vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict); + } + } + } +} + + +void vp8_build_inter_predictors_mby(MACROBLOCKD *x) +{ + if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV) + { + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = x->predictor; + int mv_row = x->mbmi.mv.as_mv.row; + int mv_col = x->mbmi.mv.as_mv.col; + int pre_stride = x->block[0].pre_stride; + + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); + } + } + else + { + int i; + + if (x->mbmi.partitioning < 3) + { + for (i = 0; i < 4; i++) + { + BLOCKD *d = &x->block[bbb[i]]; + vp8_build_inter_predictors4b(x, d, 16); + } + + } + else + { + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if 
(d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b(x, d0, 16); + else + { + vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict); + } + + } + } + } +} + +void vp8_build_inter_predictors_mb(MACROBLOCKD *x) +{ + if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV) + { + int offset; + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *uptr, *vptr; + unsigned char *pred_ptr = x->predictor; + unsigned char *upred_ptr = &x->predictor[256]; + unsigned char *vpred_ptr = &x->predictor[320]; + + int mv_row = x->mbmi.mv.as_mv.row; + int mv_col = x->mbmi.mv.as_mv.col; + int pre_stride = x->block[0].pre_stride; + + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); + } + + mv_row = x->block[16].bmi.mv.as_mv.row; + mv_col = x->block[16].bmi.mv.as_mv.col; + pre_stride >>= 1; + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8); + RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8); + } + } + else + { + int i; + + if (x->mbmi.partitioning < 3) + { + for (i = 0; i < 4; i++) + { + BLOCKD *d = &x->block[bbb[i]]; + vp8_build_inter_predictors4b(x, d, 16); + } + } + else + { + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b(x, d0, 16); + else 
+ { + vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict); + } + + } + + } + + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b(x, d0, 8); + else + { + vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict); + } + + } + + } +} + +void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel) +{ + int i, j; + + if (x->mbmi.mode == SPLITMV) + { + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->block[yoffset ].bmi.mv.as_mv.row + + x->block[yoffset+1].bmi.mv.as_mv.row + + x->block[yoffset+4].bmi.mv.as_mv.row + + x->block[yoffset+5].bmi.mv.as_mv.row; + + if (temp < 0) temp -= 4; + else temp += 4; + + x->block[uoffset].bmi.mv.as_mv.row = temp / 8; + + if (fullpixel) + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & 0xfffffff8; + + temp = x->block[yoffset ].bmi.mv.as_mv.col + + x->block[yoffset+1].bmi.mv.as_mv.col + + x->block[yoffset+4].bmi.mv.as_mv.col + + x->block[yoffset+5].bmi.mv.as_mv.col; + + if (temp < 0) temp -= 4; + else temp += 4; + + x->block[uoffset].bmi.mv.as_mv.col = temp / 8; + + if (fullpixel) + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & 0xfffffff8; + + x->block[voffset].bmi.mv.as_mv.row = x->block[uoffset].bmi.mv.as_mv.row ; + x->block[voffset].bmi.mv.as_mv.col = x->block[uoffset].bmi.mv.as_mv.col ; + } + } + } + else + { + int mvrow = x->mbmi.mv.as_mv.row; + int mvcol = x->mbmi.mv.as_mv.col; + + if (mvrow < 0) + mvrow -= 1; + else + mvrow += 1; + + if (mvcol < 0) + mvcol -= 1; + else + mvcol += 1; + + mvrow /= 2; + mvcol /= 2; + + for (i = 0; i < 8; i++) + { + x->block[ 16 + i].bmi.mv.as_mv.row = mvrow; + x->block[ 16 + i].bmi.mv.as_mv.col = mvcol; + + if (fullpixel) + { + 
x->block[ 16 + i].bmi.mv.as_mv.row = mvrow & 0xfffffff8; + x->block[ 16 + i].bmi.mv.as_mv.col = mvcol & 0xfffffff8; + } + } + } +} + + +// The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this +// situation, we can write the result directly to dst buffer instead of writing it to predictor +// buffer and then copying it to dst buffer. +static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *ptr_base; + unsigned char *ptr; + //unsigned char *pred_ptr = d->predictor; + int dst_stride = d->dst_stride; + int pre_stride = d->pre_stride; + + ptr_base = *(d->base_pre); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride); + } + else + { + ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + ptr = ptr_base; + + for (r = 0; r < 4; r++) + { +#ifdef MUST_BE_ALIGNED + dst_ptr[0] = ptr[0]; + dst_ptr[1] = ptr[1]; + dst_ptr[2] = ptr[2]; + dst_ptr[3] = ptr[3]; +#else + *(int *)dst_ptr = *(int *)ptr ; +#endif + dst_ptr += dst_stride; + ptr += pre_stride; + } + } +} + + + +void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) +{ + //unsigned char *pred_ptr = x->block[0].predictor; + //unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst; + unsigned char *pred_ptr = x->predictor; + unsigned char *dst_ptr = x->dst.y_buffer; + + if (x->mbmi.mode != SPLITMV) + { + int offset; + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *uptr, *vptr; + //unsigned char *pred_ptr = x->predictor; + //unsigned char *upred_ptr = &x->predictor[256]; + //unsigned char *vpred_ptr = &x->predictor[320]; + unsigned char *udst_ptr = x->dst.u_buffer; + unsigned char *vdst_ptr = x->dst.v_buffer; + + int mv_row = 
x->mbmi.mv.as_mv.row; + int mv_col = x->mbmi.mv.as_mv.col; + int pre_stride = x->dst.y_stride; //x->block[0].pre_stride; + + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride); + } + + mv_row = x->block[16].bmi.mv.as_mv.row; + mv_col = x->block[16].bmi.mv.as_mv.col; + pre_stride >>= 1; + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride); + RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride); + } + } + else + { + //note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later, + //if sth is wrong, go back to what it is in build_inter_predictors_mb. 
+ int i; + + if (x->mbmi.partitioning < 3) + { + for (i = 0; i < 4; i++) + { + BLOCKD *d = &x->block[bbb[i]]; + //vp8_build_inter_predictors4b(x, d, 16); + + { + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + + ptr_base = *(d->base_pre); + ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride); + } + } + } + } + else + { + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + { + //vp8_build_inter_predictors2b(x, d0, 16); + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d0->predictor; + + ptr_base = *(d0->base_pre); + ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3); + + if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride); + } + } + else + { + vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict); + vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict); + } + } + } + + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + { + //vp8_build_inter_predictors2b(x, d0, 8); + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d0->predictor; + + ptr_base = *(d0->base_pre); + ptr = ptr_base + d0->pre + 
(d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3); + + if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride); + } + } + else + { + vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict); + vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict); + } + } + } +} diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h new file mode 100644 index 000000000..b2d1ae97a --- /dev/null +++ b/vp8/common/reconinter.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_RECONINTER_H +#define __INC_RECONINTER_H + +extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); +extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x); + +extern void vp8_build_inter_predictors_mby(MACROBLOCKD *x); +extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel); +extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf); +extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x); + +#endif diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c new file mode 100644 index 000000000..e33bce348 --- /dev/null +++ b/vp8/common/reconintra.c @@ -0,0 +1,555 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "reconintra.h" +#include "vpx_mem/vpx_mem.h" + +// For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and +// vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x). + +void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + int i; + + for (i = 16; i < 24; i += 2) + { + BLOCKD *b = &x->block[i]; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } +} + +void vp8_build_intra_predictors_mby(MACROBLOCKD *x) +{ + + unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride; + unsigned char yleft_col[16]; + unsigned char ytop_left = yabove_row[-1]; + unsigned char *ypred_ptr = x->predictor; + int r, c, i; + + for (i = 0; i < 16; i++) + { + yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1]; + } + + // for Y + switch (x->mbmi.mode) + { + case DC_PRED: + { + int expected_dc; + int i; + int shift; + int average = 0; + + + if (x->up_available || x->left_available) + { + if (x->up_available) + { + for (i = 0; i < 16; i++) + { + average += yabove_row[i]; + } + } + + if (x->left_available) + { + + for (i = 0; i < 16; i++) + { + average += yleft_col[i]; + } + + } + + + + shift = 3 + x->up_available + x->left_available; + expected_dc = (average + (1 << (shift - 1))) >> shift; + } + else + { + expected_dc = 128; + } + + vpx_memset(ypred_ptr, expected_dc, 256); + } + break; + case V_PRED: + { + + for (r = 0; r < 16; r++) + { + + ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0]; + ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1]; + ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2]; + ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3]; + ypred_ptr += 16; + } + } + break; + case H_PRED: + { + + for (r = 0; r < 16; r++) + { + + vpx_memset(ypred_ptr, yleft_col[r], 16); + ypred_ptr += 16; + } + + } + break; + case TM_PRED: + { + + 
for (r = 0; r < 16; r++) + { + for (c = 0; c < 16; c++) + { + int pred = yleft_col[r] + yabove_row[ c] - ytop_left; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + ypred_ptr[c] = pred; + } + + ypred_ptr += 16; + } + + } + break; + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} + +void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) +{ + + unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride; + unsigned char yleft_col[16]; + unsigned char ytop_left = yabove_row[-1]; + unsigned char *ypred_ptr = x->predictor; + int r, c, i; + + int y_stride = x->dst.y_stride; + ypred_ptr = x->dst.y_buffer; //x->predictor; + + for (i = 0; i < 16; i++) + { + yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1]; + } + + // for Y + switch (x->mbmi.mode) + { + case DC_PRED: + { + int expected_dc; + int i; + int shift; + int average = 0; + + + if (x->up_available || x->left_available) + { + if (x->up_available) + { + for (i = 0; i < 16; i++) + { + average += yabove_row[i]; + } + } + + if (x->left_available) + { + + for (i = 0; i < 16; i++) + { + average += yleft_col[i]; + } + + } + + + + shift = 3 + x->up_available + x->left_available; + expected_dc = (average + (1 << (shift - 1))) >> shift; + } + else + { + expected_dc = 128; + } + + //vpx_memset(ypred_ptr, expected_dc, 256); + for (r = 0; r < 16; r++) + { + vpx_memset(ypred_ptr, expected_dc, 16); + ypred_ptr += y_stride; //16; + } + } + break; + case V_PRED: + { + + for (r = 0; r < 16; r++) + { + + ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0]; + ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1]; + ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2]; + ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3]; + ypred_ptr += y_stride; //16; + } + } + break; + case H_PRED: + { + + for (r = 0; r < 16; r++) + { + + vpx_memset(ypred_ptr, yleft_col[r], 16); + ypred_ptr += y_stride; //16; + } + + } + break; + case TM_PRED: + { + + for (r = 0; r < 
16; r++) + { + for (c = 0; c < 16; c++) + { + int pred = yleft_col[r] + yabove_row[ c] - ytop_left; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + ypred_ptr[c] = pred; + } + + ypred_ptr += y_stride; //16; + } + + } + break; + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} + +void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x) +{ + unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride; + unsigned char uleft_col[16]; + unsigned char utop_left = uabove_row[-1]; + unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride; + unsigned char vleft_col[20]; + unsigned char vtop_left = vabove_row[-1]; + unsigned char *upred_ptr = &x->predictor[256]; + unsigned char *vpred_ptr = &x->predictor[320]; + int i, j; + + for (i = 0; i < 8; i++) + { + uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1]; + vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1]; + } + + switch (x->mbmi.uv_mode) + { + case DC_PRED: + { + int expected_udc; + int expected_vdc; + int i; + int shift; + int Uaverage = 0; + int Vaverage = 0; + + if (x->up_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uabove_row[i]; + Vaverage += vabove_row[i]; + } + } + + if (x->left_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uleft_col[i]; + Vaverage += vleft_col[i]; + } + } + + if (!x->up_available && !x->left_available) + { + expected_udc = 128; + expected_vdc = 128; + } + else + { + shift = 2 + x->up_available + x->left_available; + expected_udc = (Uaverage + (1 << (shift - 1))) >> shift; + expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift; + } + + + vpx_memset(upred_ptr, expected_udc, 64); + vpx_memset(vpred_ptr, expected_vdc, 64); + + + } + break; + case V_PRED: + { + int i; + + for (i = 0; i < 8; i++) + { + vpx_memcpy(upred_ptr, uabove_row, 8); + vpx_memcpy(vpred_ptr, vabove_row, 8); + upred_ptr += 8; + vpred_ptr += 8; + } + + } + break; + case H_PRED: + { + int i; 
+ + for (i = 0; i < 8; i++) + { + vpx_memset(upred_ptr, uleft_col[i], 8); + vpx_memset(vpred_ptr, vleft_col[i], 8); + upred_ptr += 8; + vpred_ptr += 8; + } + } + + break; + case TM_PRED: + { + int i; + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + int predu = uleft_col[i] + uabove_row[j] - utop_left; + int predv = vleft_col[i] + vabove_row[j] - vtop_left; + + if (predu < 0) + predu = 0; + + if (predu > 255) + predu = 255; + + if (predv < 0) + predv = 0; + + if (predv > 255) + predv = 255; + + upred_ptr[j] = predu; + vpred_ptr[j] = predv; + } + + upred_ptr += 8; + vpred_ptr += 8; + } + + } + break; + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} + +void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x) +{ + unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride; + unsigned char uleft_col[16]; + unsigned char utop_left = uabove_row[-1]; + unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride; + unsigned char vleft_col[20]; + unsigned char vtop_left = vabove_row[-1]; + unsigned char *upred_ptr = x->dst.u_buffer; //&x->predictor[256]; + unsigned char *vpred_ptr = x->dst.v_buffer; //&x->predictor[320]; + int uv_stride = x->dst.uv_stride; + + int i, j; + + for (i = 0; i < 8; i++) + { + uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1]; + vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1]; + } + + switch (x->mbmi.uv_mode) + { + case DC_PRED: + { + int expected_udc; + int expected_vdc; + int i; + int shift; + int Uaverage = 0; + int Vaverage = 0; + + if (x->up_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uabove_row[i]; + Vaverage += vabove_row[i]; + } + } + + if (x->left_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uleft_col[i]; + Vaverage += vleft_col[i]; + } + } + + if (!x->up_available && !x->left_available) + { + expected_udc = 128; + expected_vdc = 128; + } + else + { + shift = 2 + x->up_available + 
x->left_available; + expected_udc = (Uaverage + (1 << (shift - 1))) >> shift; + expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift; + } + + + //vpx_memset(upred_ptr,expected_udc,64); + //vpx_memset(vpred_ptr,expected_vdc,64); + for (i = 0; i < 8; i++) + { + vpx_memset(upred_ptr, expected_udc, 8); + vpx_memset(vpred_ptr, expected_vdc, 8); + upred_ptr += uv_stride; //8; + vpred_ptr += uv_stride; //8; + } + } + break; + case V_PRED: + { + int i; + + for (i = 0; i < 8; i++) + { + vpx_memcpy(upred_ptr, uabove_row, 8); + vpx_memcpy(vpred_ptr, vabove_row, 8); + upred_ptr += uv_stride; //8; + vpred_ptr += uv_stride; //8; + } + + } + break; + case H_PRED: + { + int i; + + for (i = 0; i < 8; i++) + { + vpx_memset(upred_ptr, uleft_col[i], 8); + vpx_memset(vpred_ptr, vleft_col[i], 8); + upred_ptr += uv_stride; //8; + vpred_ptr += uv_stride; //8; + } + } + + break; + case TM_PRED: + { + int i; + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + int predu = uleft_col[i] + uabove_row[j] - utop_left; + int predv = vleft_col[i] + vabove_row[j] - vtop_left; + + if (predu < 0) + predu = 0; + + if (predu > 255) + predu = 255; + + if (predv < 0) + predv = 0; + + if (predv > 255) + predv = 255; + + upred_ptr[j] = predu; + vpred_ptr[j] = predv; + } + + upred_ptr += uv_stride; //8; + vpred_ptr += uv_stride; //8; + } + + } + break; + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h new file mode 100644 index 000000000..d63aa15cb --- /dev/null +++ b/vp8/common/reconintra.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef __INC_RECONINTRA_H +#define __INC_RECONINTRA_H + +extern void init_intra_left_above_pixels(MACROBLOCKD *x); + +extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); +extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); + +extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x); + +extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor); + +#endif diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c new file mode 100644 index 000000000..d92d5c96a --- /dev/null +++ b/vp8/common/reconintra4x4.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "vpx_mem/vpx_mem.h" +#include "reconintra.h" + +void vp8_predict_intra4x4(BLOCKD *x, + int b_mode, + unsigned char *predictor) +{ + int i, r, c; + + unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride; + unsigned char Left[4]; + unsigned char top_left = Above[-1]; + + Left[0] = (*(x->base_dst))[x->dst - 1]; + Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; + Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; + Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; + + switch (b_mode) + { + case B_DC_PRED: + { + int expected_dc = 0; + + for (i = 0; i < 4; i++) + { + expected_dc += Above[i]; + expected_dc += Left[i]; + } + + expected_dc = (expected_dc + 4) >> 3; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = expected_dc; + } + + predictor += 16; + } + } + break; + case B_TM_PRED: + { + // prediction similar to true_motion prediction + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int pred = Above[c] - top_left + Left[r]; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + predictor[c] = pred; + } + + predictor += 16; + } + } + break; + + case B_VE_PRED: + { + + unsigned int ap[4]; + ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2; + ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2; + ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2; + ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + + predictor[c] = ap[c]; + } + + predictor += 16; + } + + } + break; + + + case B_HE_PRED: + { + + unsigned int lp[4]; + lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2; + lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2; + lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2; + lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = lp[r]; + } + + predictor += 16; 
+ } + } + break; + case B_LD_PRED: + { + unsigned char *ptr = Above; + predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + predictor[0 * 16 + 1] = + predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + + } + break; + case B_RD_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[3 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + + } + break; + case B_VR_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) 
>> 2; + predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; + predictor[3 * 16 + 2] = + predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; + predictor[3 * 16 + 3] = + predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; + predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; + + } + break; + case B_VL_PRED: + { + + unsigned char *pp = Above; + + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; + predictor[1 * 16 + 1] = + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + case B_HD_PRED: + { + unsigned char pp[9]; + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; + predictor[2 * 16 + 1] = + predictor[3 * 16 + 3] = 
(pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + + case B_HU_PRED: + { + unsigned char *pp = Left; + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[2 * 16 + 3] = + predictor[3 * 16 + 0] = + predictor[3 * 16 + 1] = + predictor[3 * 16 + 2] = + predictor[3 * 16 + 3] = pp[3]; + } + break; + + + } +} +// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and +// to the right prediction have filled in pixels to use. 
+void vp8_intra_prediction_down_copy(MACROBLOCKD *x) +{ + unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; + + unsigned int *src_ptr = (unsigned int *)above_right; + unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride); + unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride); + unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride); + + *dst_ptr0 = *src_ptr; + *dst_ptr1 = *src_ptr; + *dst_ptr2 = *src_ptr; +} + + +void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + int i; + + vp8_intra_prediction_down_copy(x); + + for (i = 0; i < 16; i++) + { + BLOCKD *b = &x->block[i]; + + vp8_predict_intra4x4(b, x->block[i].bmi.mode, x->block[i].predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } + + vp8_recon_intra_mbuv(rtcd, x); + +} diff --git a/vp8/common/reconintra4x4.h b/vp8/common/reconintra4x4.h new file mode 100644 index 000000000..788c8c40a --- /dev/null +++ b/vp8/common/reconintra4x4.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_RECONINTRA4x4_H +#define __INC_RECONINTRA4x4_H + +extern void vp8_intra_prediction_down_copy(MACROBLOCKD *x); + +#endif diff --git a/vp8/common/segmentation_common.c b/vp8/common/segmentation_common.c new file mode 100644 index 000000000..72b8c874b --- /dev/null +++ b/vp8/common/segmentation_common.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "segmentation_common.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_update_gf_useage_maps(VP8_COMMON *cm, MACROBLOCKD *xd) +{ + int mb_row, mb_col; + + MODE_INFO *this_mb_mode_info = cm->mi; + + xd->gf_active_ptr = (signed char *)cm->gf_active_flags; + + if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) + { + // Reset Gf useage monitors + vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); + cm->gf_active_count = cm->mb_rows * cm->mb_cols; + } + else + { + // for each macroblock row in image + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + // for each macroblock col in image + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + + // If using golden then set GF active flag if not already set. + // If using last frame 0,0 mode then leave flag as it is + // else if using non 0,0 motion or intra modes then clear flag if it is currently set + if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) + { + if (*(xd->gf_active_ptr) == 0) + { + *(xd->gf_active_ptr) = 1; + cm->gf_active_count ++; + } + } + else if ((this_mb_mode_info->mbmi.mode != ZEROMV) && *(xd->gf_active_ptr)) + { + *(xd->gf_active_ptr) = 0; + cm->gf_active_count--; + } + + xd->gf_active_ptr++; // Step onto next entry + this_mb_mode_info++; // skip to next mb + + } + + // this is to account for the border + this_mb_mode_info++; + } + } +} diff --git a/vp8/common/segmentation_common.h b/vp8/common/segmentation_common.h new file mode 100644 index 000000000..bb93533a3 --- /dev/null +++ b/vp8/common/segmentation_common.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "string.h" +#include "blockd.h" +#include "onyxc_int.h" + +extern void vp8_update_gf_useage_maps(VP8_COMMON *cm, MACROBLOCKD *xd); diff --git a/vp8/common/setupintrarecon.c b/vp8/common/setupintrarecon.c new file mode 100644 index 000000000..dcaafe6c6 --- /dev/null +++ b/vp8/common/setupintrarecon.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "setupintrarecon.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) +{ + int i; + + // set up frame new frame for intra coded blocks + vpx_memset(ybf->y_buffer - 1 - 2 * ybf->y_stride, 127, ybf->y_width + 5); + vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + + for (i = 0; i < ybf->y_height; i++) + ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129; + + vpx_memset(ybf->u_buffer - 1 - 2 * ybf->uv_stride, 127, ybf->uv_width + 5); + vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + + for (i = 0; i < ybf->uv_height; i++) + ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129; + + vpx_memset(ybf->v_buffer - 1 - 2 * ybf->uv_stride, 127, ybf->uv_width + 5); + vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + + for (i = 0; i < ybf->uv_height; i++) + ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129; + +} diff --git a/vp8/common/setupintrarecon.h b/vp8/common/setupintrarecon.h new file mode 100644 index 
000000000..6ec79b29c --- /dev/null +++ b/vp8/common/setupintrarecon.h @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_scale/yv12config.h" +extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); diff --git a/vp8/common/subpixel.h b/vp8/common/subpixel.h new file mode 100644 index 000000000..fbd5f4daf --- /dev/null +++ b/vp8/common/subpixel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef SUBPIXEL_H +#define SUBPIXEL_H + +#define prototype_subpixel_predict(sym) \ + void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \ + unsigned char *dst, int dst_pitch) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/subpixel_x86.h" +#endif + +#if ARCH_ARM +#include "arm/subpixel_arm.h" +#endif + +#ifndef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_c +#endif +extern prototype_subpixel_predict(vp8_subpix_sixtap16x16); + +#ifndef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_c +#endif +extern prototype_subpixel_predict(vp8_subpix_sixtap8x8); + +#ifndef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_c +#endif +extern prototype_subpixel_predict(vp8_subpix_sixtap8x4); + +#ifndef vp8_subpix_sixtap4x4 +#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_c +#endif +extern prototype_subpixel_predict(vp8_subpix_sixtap4x4); + +#ifndef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_c +#endif +extern prototype_subpixel_predict(vp8_subpix_bilinear16x16); + +#ifndef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_c +#endif +extern prototype_subpixel_predict(vp8_subpix_bilinear8x8); + +#ifndef vp8_subpix_bilinear8x4 +#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_c +#endif +extern prototype_subpixel_predict(vp8_subpix_bilinear8x4); + +#ifndef vp8_subpix_bilinear4x4 +#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_c +#endif +extern prototype_subpixel_predict(vp8_subpix_bilinear4x4); + +typedef prototype_subpixel_predict((*vp8_subpix_fn_t)); +typedef struct +{ + vp8_subpix_fn_t sixtap16x16; + vp8_subpix_fn_t sixtap8x8; + vp8_subpix_fn_t sixtap8x4; + vp8_subpix_fn_t sixtap4x4; + vp8_subpix_fn_t bilinear16x16; + vp8_subpix_fn_t bilinear8x8; + vp8_subpix_fn_t bilinear8x4; + vp8_subpix_fn_t bilinear4x4; +} vp8_subpix_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define 
SUBPIX_INVOKE(ctx,fn) (ctx)->fn +#else +#define SUBPIX_INVOKE(ctx,fn) vp8_subpix_##fn +#endif + +#endif diff --git a/vp8/common/swapyv12buffer.c b/vp8/common/swapyv12buffer.c new file mode 100644 index 000000000..afe6a885e --- /dev/null +++ b/vp8/common/swapyv12buffer.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "swapyv12buffer.h" + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame) +{ + unsigned char *temp; + + temp = last_frame->buffer_alloc; + last_frame->buffer_alloc = new_frame->buffer_alloc; + new_frame->buffer_alloc = temp; + + temp = last_frame->y_buffer; + last_frame->y_buffer = new_frame->y_buffer; + new_frame->y_buffer = temp; + + temp = last_frame->u_buffer; + last_frame->u_buffer = new_frame->u_buffer; + new_frame->u_buffer = temp; + + temp = last_frame->v_buffer; + last_frame->v_buffer = new_frame->v_buffer; + new_frame->v_buffer = temp; + +} diff --git a/vp8/common/swapyv12buffer.h b/vp8/common/swapyv12buffer.h new file mode 100644 index 000000000..caf9499d9 --- /dev/null +++ b/vp8/common/swapyv12buffer.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef SWAPYV12_BUFFER_H +#define SWAPYV12_BUFFER_H + +#include "vpx_scale/yv12config.h" + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame); + +#endif diff --git a/vp8/common/systemdependent.h b/vp8/common/systemdependent.h new file mode 100644 index 000000000..1829b649c --- /dev/null +++ b/vp8/common/systemdependent.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#if ARCH_X86 || ARCH_X86_64 +void vpx_reset_mmx_state(void); +#define vp8_clear_system_state() vpx_reset_mmx_state() +#else +#define vp8_clear_system_state() +#endif + +struct VP8Common; +void vp8_machine_specific_config(struct VP8Common *); diff --git a/vp8/common/textblit.c b/vp8/common/textblit.c new file mode 100644 index 000000000..a45937b12 --- /dev/null +++ b/vp8/common/textblit.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + + + +void vp8_blit_text(const char *msg, unsigned char *address, const int pitch) +{ + int letter_bitmap; + unsigned char *output_pos = address; + int colpos; + const int font[] = + { + 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000, + 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110, + 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA, + 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20, + 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF, + 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F, + 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2, + 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731, + 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820 + }; + colpos = 0; + + while (msg[colpos] != 0) + { + char letter = msg[colpos]; + int fontcol, fontrow; + + if (letter <= 'Z' && letter >= ' ') + letter_bitmap = font[letter-' ']; + else if (letter <= 'z' && letter >= 'a') + letter_bitmap = font[letter-'a'+'A' - ' ']; + else + letter_bitmap = font[0]; + + for (fontcol = 6; fontcol >= 0 ; fontcol--) + for (fontrow = 0; fontrow < 5; fontrow++) + output_pos[fontrow *pitch + fontcol] = + ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0); + + output_pos += 7; + colpos++; + } +} diff --git a/vp8/common/threading.h b/vp8/common/threading.h new file mode 100644 index 000000000..a02cb244b --- /dev/null +++ b/vp8/common/threading.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef _PTHREAD_EMULATION +#define _PTHREAD_EMULATION + +#define VPXINFINITE 10000 //10second. 
+ +/* Thread management macros */ +#ifdef _WIN32 +/* Win32 */ +#define _WIN32_WINNT 0x500 /* WINBASE.H - Enable signal_object_and_wait */ +#include <process.h> +#include <windows.h> +#define THREAD_FUNCTION DWORD WINAPI +#define THREAD_FUNCTION_RETURN DWORD +#define THREAD_SPECIFIC_INDEX DWORD +#define pthread_t HANDLE +#define pthread_attr_t DWORD +#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL) +#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread)) +#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread) +#define thread_sleep(nms) Sleep(nms) +#define pthread_cancel(thread) terminate_thread(thread,0) +#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();}; +#define pthread_getspecific(ts_key) TlsGetValue(ts_key) +#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value) +#define pthread_self() GetCurrentThreadId() +#else +#ifdef __APPLE__ +#include <mach/semaphore.h> +#include <mach/task.h> +#include <time.h> +#include <unistd.h> + +#else +#include <semaphore.h> +#endif + +#include <pthread.h> +/* pthreads */ +/* Nearly everything is already defined */ +#define THREAD_FUNCTION void * +#define THREAD_FUNCTION_RETURN void * +#define THREAD_SPECIFIC_INDEX pthread_key_t +#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor); +#endif + +/* Syncrhronization macros: Win32 and Pthreads */ +#ifdef _WIN32 +#define sem_t HANDLE +#define pause(voidpara) __asm PAUSE +#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL) +#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE)) +#define sem_post(sem) SetEvent(*sem) +#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE) +#define thread_sleep(nms) Sleep(nms) + +#else + +#ifdef __APPLE__ 
+#define sem_t semaphore_t +#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z) +#define sem_wait(sem) (semaphore_wait(*sem) ) +#define sem_post(sem) semaphore_signal(*sem) +#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem) +#define thread_sleep(nms) // { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} +#else +#include <unistd.h> +#define thread_sleep(nms) usleep(nms*1000);// {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} +#endif +/* Not Windows. Assume pthreads */ + +#endif + +#if ARCH_X86 || ARCH_X86_64 +#include "vpx_ports/x86.h" +#else +#define x86_pause_hint() +#endif + +#endif diff --git a/vp8/common/treecoder.c b/vp8/common/treecoder.c new file mode 100644 index 000000000..4ad018d49 --- /dev/null +++ b/vp8/common/treecoder.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#if CONFIG_DEBUG +#include <assert.h> +#endif +#include <stdio.h> + +#include "treecoder.h" + +static void tree2tok( + struct vp8_token_struct *const p, + vp8_tree t, + int i, + int v, + int L +) +{ + v += v; + ++L; + + do + { + const vp8_tree_index j = t[i++]; + + if (j <= 0) + { + p[-j].value = v; + p[-j].Len = L; + } + else + tree2tok(p, t, j, v, L); + } + while (++v & 1); +} + +void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t) +{ + tree2tok(p, t, 0, 0, 0); +} + +static void branch_counts( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ] +) +{ + const int tree_len = n - 1; + int t = 0; + +#if CONFIG_DEBUG + assert(tree_len); +#endif + + do + { + branch_ct[t][0] = branch_ct[t][1] = 0; + } + while (++t < tree_len); + + t = 0; + + do + { + int L = tok[t].Len; + const int enc = tok[t].value; + const unsigned int ct = num_events[t]; + + vp8_tree_index i = 0; + + do + { + const int b = (enc >> --L) & 1; + const int j = i >> 1; +#if CONFIG_DEBUG + assert(j < tree_len && 0 <= L); +#endif + + branch_ct [j] [b] += ct; + i = tree[ i + b]; + } + while (i > 0); + +#if CONFIG_DEBUG + assert(!L); +#endif + } + while (++t < n); + +} + + +void vp8_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int Pfac, + int rd +) +{ + const int tree_len = n - 1; + int t = 0; + + branch_counts(n, tok, tree, branch_ct, num_events); + + do + { + const unsigned int *const c = branch_ct[t]; + const unsigned int tot = c[0] + c[1]; + +#if CONFIG_DEBUG + assert(tot < (1 << 24)); /* no overflow below */ +#endif + + if (tot) + { + const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot; + probs[t] = p < 256 ? (p ? 
p : 1) : 255; /* agree w/old version for now */ + } + else + probs[t] = vp8_prob_half; + } + while (++t < tree_len); +} diff --git a/vp8/common/treecoder.h b/vp8/common/treecoder.h new file mode 100644 index 000000000..0356d2b02 --- /dev/null +++ b/vp8/common/treecoder.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_TREECODER_H +#define __INC_TREECODER_H + +typedef unsigned char vp8bc_index_t; // probability index + + +typedef unsigned char vp8_prob; + +#define vp8_prob_half ( (vp8_prob) 128) + +typedef signed char vp8_tree_index; +struct bool_coder_spec; + +typedef struct bool_coder_spec bool_coder_spec; +typedef struct bool_writer bool_writer; +typedef struct bool_reader bool_reader; + +typedef const bool_coder_spec c_bool_coder_spec; +typedef const bool_writer c_bool_writer; +typedef const bool_reader c_bool_reader; + + + +# define vp8_complement( x) (255 - x) + + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp8_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vp8_tree_index vp8_tree[], *vp8_tree_p; + + +typedef const struct vp8_token_struct +{ + int value; + int Len; +} vp8_token; + +/* Construct encoding array from tree. */ + +void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree); + + +/* Convert array of token occurrence counts into a table of probabilities + for the associated binary encoding tree. 
Also writes count of branches + taken for each node on the tree; this facilitiates decisions as to + probability updates. */ + +void vp8_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int Pfactor, + int Round +); + +/* Variant of above using coder spec rather than hardwired 8-bit probs. */ + +void vp8bc_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + c_bool_coder_spec *s +); + + +#endif diff --git a/vp8/common/type_aliases.h b/vp8/common/type_aliases.h new file mode 100644 index 000000000..addd26469 --- /dev/null +++ b/vp8/common/type_aliases.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/**************************************************************************** +* +* Module Title : type_aliases.h +* +* Description : Standard type aliases +* +****************************************************************************/ +#ifndef __INC_TYPE_ALIASES_H +#define __INC_TYPE_ALIASES_H + +/**************************************************************************** +* Macros +****************************************************************************/ +#define EXPORT +#define IMPORT extern /* Used to declare imported data & routines */ +#define PRIVATE static /* Used to declare & define module-local data */ +#define LOCAL static /* Used to define all persistent routine-local data */ +#define STD_IN_PATH 0 /* Standard input path */ +#define STD_OUT_PATH 1 /* Standard output path */ +#define STD_ERR_PATH 2 /* Standard error path */ +#define STD_IN_FILE stdin /* Standard input file pointer */ +#define STD_OUT_FILE stdout /* Standard output file pointer */ +#define STD_ERR_FILE stderr /* Standard error file pointer */ +#define max_int 0x7FFFFFFF + +#define __export +#define _export + +#define CCONV + +#ifndef NULL +#ifdef __cplusplus +#define NULL 0 +#else +#define NULL ((void *)0) +#endif +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +/**************************************************************************** +* Typedefs +****************************************************************************/ +#ifndef TYPE_INT8 +#define TYPE_INT8 +typedef signed char INT8; +#endif + +#ifndef TYPE_INT16 +//#define TYPE_INT16 +typedef signed short INT16; +#endif + +#ifndef TYPE_INT32 +//#define TYPE_INT32 +typedef signed int INT32; +#endif + +#ifndef TYPE_UINT8 +//#define TYPE_UINT8 +typedef unsigned char UINT8; +#endif + +#ifndef TYPE_UINT32 +//#define TYPE_UINT32 +typedef unsigned int UINT32; +#endif + +#ifndef TYPE_UINT16 +//#define TYPE_UINT16 +typedef unsigned short UINT16; +#endif + +#ifndef TYPE_BOOL 
+//#define TYPE_BOOL +typedef int BOOL; +#endif + +typedef unsigned char BOOLEAN; + +#ifdef _MSC_VER +typedef __int64 INT64; +#else + +#ifndef TYPE_INT64 +#ifdef _TMS320C6X +//for now we only have 40bits +typedef long INT64; +#else +typedef long long INT64; +#endif +#endif + +#endif + +/* Floating point */ +typedef double FLOAT64; +typedef float FLOAT32; + +#endif diff --git a/vp8/common/vfwsetting.hpp b/vp8/common/vfwsetting.hpp new file mode 100644 index 000000000..e352e7a19 --- /dev/null +++ b/vp8/common/vfwsetting.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#if !defined(VFWSETTING_HPP) +#define VFWSETTING_HPP +//______________________________________________________________________________ +// +// VFWSetting.hpp +// + +#include "four_cc.hpp" +#include <iosfwd> + +namespace vpxvp +{ + + //-------------------------------------- + class VFWSetting + { + friend std::ostream& operator<<(std::ostream& os, const VFWSetting& vfws); + + public: + + enum Mode + { + m_setting, + m_config + }; + + enum + { + header_size = 8, + Size = 16 + }; + + VFWSetting(four_cc fcc); + ~VFWSetting(); + + four_cc fcc() const; + Mode mode() const; + + int setting() const; + int value() const; + void setting_value(int i_setting, int i_value); // Sets mode to m_setting + + long size() const; + const void* data() const; + int data(const void* p_data, unsigned long ul_size); + + private: + + VFWSetting(const VFWSetting& vfws); // Not implemented + VFWSetting& operator=(const VFWSetting& vfws); // Not implemented + + int extract_(const void* p_data, unsigned long ul_size); + void update_() const; + + four_cc m_fcc; + Mode m_mode; + int m_i_setting; + int m_i_value; + + 
mutable unsigned char m_p_data[Size]; + }; + +} // namespace vpxvp + +#endif // VFWSETTING_HPP diff --git a/vp8/common/vpx_ref_build_prefix.h b/vp8/common/vpx_ref_build_prefix.h new file mode 100644 index 000000000..40608c6dd --- /dev/null +++ b/vp8/common/vpx_ref_build_prefix.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef _VPX_REF_BUILD_PREFIX_h +#define _VPX_REF_BUILD_PREFIX_h + +#if defined(__cplusplus) +extern "C" { +#endif + + +#if defined(__cplusplus) +} +#endif + +#endif /* include guards */ diff --git a/vp8/common/vpxblit.h b/vp8/common/vpxblit.h new file mode 100644 index 000000000..d03e0bd02 --- /dev/null +++ b/vp8/common/vpxblit.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef VPXBLIT_H_INCL +#define VPXBLIT_H_INCL +/*============================================================================== + Includes +==============================================================================*/ + +/*============================================================================== + Defines +==============================================================================*/ + + +#ifdef VPX_BIG_ENDIAN +#define BYTE_ZERO(X) ((X & 0xFF000000) >> (24 - 2) ) +#define BYTE_ONE(X) ((X & 0x00FF0000) >> (16 - 2) ) +#define BYTE_TWO(X) ((X & 0x0000FF00) >> (8 - 2) ) +#define BYTE_THREE(X) ((X & 0x000000FF) << (0 + 2) ) + +#define BYTE_ZERO_UV(X) ((X & 0x0000FF00) >> (8 - 2) ) +#define BYTE_ONE_UV(X) ((X & 0x000000FF) << (0 + 2) ) + +#define REREFERENCE(X) (*((int *) &(X))) + +#else + +#define BYTE_THREE(X) ((X & 0xFF000000) >> (24 - 2) ) +#define BYTE_TWO(X) ((X & 0x00FF0000) >> (16 - 2) ) +#define BYTE_ONE(X) ((X & 0x0000FF00) >> (8 - 2) ) +#define BYTE_ZERO(X) ((X & 0x000000FF) << (0 + 2) ) + +#define BYTE_ONE_UV(X) ((X & 0x0000FF00) >> (8 - 2) ) +#define BYTE_ZERO_UV(X) ((X & 0x000000FF) << (0 + 2) ) + +#define REREFERENCE(X) (*((int *) &(X))) + +#endif + + +/*============================================================================== + Type Definitions +==============================================================================*/ +typedef struct // YUV buffer configuration structure +{ + int y_width; + int y_height; + int y_stride; + + int uv_width; + int uv_height; + int uv_stride; + + char *y_buffer; + char *u_buffer; + char *v_buffer; + + char *uv_start; + int uv_dst_area; + int uv_used_area; + +} VPX_BLIT_CONFIG; + +typedef struct tx86_params +{ + unsigned int pushed_registers[6]; + unsigned int return_address; + unsigned int dst; + unsigned int scrn_pitch; + VPX_BLIT_CONFIG *buff_config; +} x86_params; + +/*============================================================================= + Enums 
+==============================================================================*/ + + +/*============================================================================== + Structures +==============================================================================*/ + +/*============================================================================== + Constants +==============================================================================*/ + + +/*============================================================================== + Variables +==============================================================================*/ + + + + +/*============================================================================== + Function Protoypes/MICROS +==============================================================================*/ +int vpx_get_size_of_pixel(unsigned int bd); +void *vpx_get_blitter(unsigned int bd); +void vpx_set_blit(void); +void vpx_destroy_blit(void); + + + +#endif //VPXBLIT_H_INCL diff --git a/vp8/common/vpxblit_c64.h b/vp8/common/vpxblit_c64.h new file mode 100644 index 000000000..a8e28f59a --- /dev/null +++ b/vp8/common/vpxblit_c64.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef _VPX_BLIT_C64_h +#define _VPX_BLIT_C64_h + +/**************************************************************************** +* Typedefs +****************************************************************************/ + +typedef struct // YUV buffer configuration structure +{ + int y_width; + int y_height; + int y_stride; + + int uv_width; + int uv_height; + int uv_stride; + + unsigned char *y_buffer; + unsigned char *u_buffer; + unsigned char *v_buffer; + + unsigned char *y_ptr_scrn; + unsigned char *u_ptr_scrn; + unsigned char *v_ptr_scrn; + +} DXV_YUV_BUFFER_CONFIG; + +typedef struct +{ + unsigned char *rgbptr_scrn; + unsigned char *y_ptr_scrn; + unsigned char *u_ptr_scrn; + unsigned char *v_ptr_scrn; + unsigned char *rgbptr_scrn2; +} DXV_FINAL_VIDEO; + +#endif /* include guards */ diff --git a/vp8/common/vpxerrors.h b/vp8/common/vpxerrors.h new file mode 100644 index 000000000..e4c9f3ef3 --- /dev/null +++ b/vp8/common/vpxerrors.h @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + + +#define ALLOC_FAILURE -2 diff --git a/vp8/common/x86/boolcoder.cxx b/vp8/common/x86/boolcoder.cxx new file mode 100644 index 000000000..06faca69c --- /dev/null +++ b/vp8/common/x86/boolcoder.cxx @@ -0,0 +1,493 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + + +/* Arithmetic bool coder with largish probability range. 
+ Timothy S Murphy 6 August 2004 */ + +#include <assert.h> +#include <math.h> + +#include "bool_coder.h" + +#if tim_vp8 + extern "C" { +# include "VP8cx/treewriter.h" + } +#endif + +int_types::~int_types() {} + +void bool_coder_spec::check_prec() const { + assert( w && (r==Up || w > 1) && w < 24 && (ebias || w < 17)); +} + +bool bool_coder_spec::float_init( uint Ebits, uint Mbits) { + uint b = (ebits = Ebits) + (mbits = Mbits); + if( b) { + assert( ebits < 6 && w + mbits < 31); + assert( ebits + mbits < sizeof(Index) * 8); + ebias = (1 << ebits) + 1 + mbits; + mmask = (1 << mbits) - 1; + max_index = ( ( half_index = 1 << b ) << 1) - 1; + } else { + ebias = 0; + max_index = 255; + half_index = 128; + } + check_prec(); + return b? 1:0; +} + +void bool_coder_spec::cost_init() +{ + static cdouble c = -(1 << 20)/log( 2.); + + FILE *f = fopen( "costs.txt", "w"); + assert( f); + + assert( sizeof(int) >= 4); /* for C interface */ + assert( max_index <= 255); /* size of Ctbl */ + uint i = 0; do { + cdouble p = ( *this)( (Index) i); + Ctbl[i] = (uint32) ( log( p) * c); + fprintf( + f, "cost( %d -> %10.7f) = %10d = %12.5f bits\n", + i, p, Ctbl[i], (double) Ctbl[i] / (1<<20) + ); + } while( ++i <= max_index); + fclose( f); +} + +bool_coder_spec_explicit_table::bool_coder_spec_explicit_table( + cuint16 tbl[256], Rounding rr, uint prec +) + : bool_coder_spec( prec, rr) +{ + check_prec(); + uint i = 0; + if( tbl) + do { Ptbl[i] = tbl[i];} while( ++i < 256); + else + do { Ptbl[i] = i << 8;} while( ++i < 256); + cost_init(); +} + + +bool_coder_spec_exponential_table::bool_coder_spec_exponential_table( + uint x, Rounding rr, uint prec +) + : bool_coder_spec( prec, rr) +{ + assert( x > 1 && x <= 16); + check_prec(); + Ptbl[128] = 32768u; + Ptbl[0] = (uint16) pow( 2., 16. - x); + --x; + int i=1; do { + cdouble d = pow( .5, 1. + (1. 
- i/128.)*x) * 65536.; + uint16 v = (uint16) d; + if( v < i) + v = i; + Ptbl[256-i] = (uint16) ( 65536U - (Ptbl[i] = v)); + } while( ++i < 128); + cost_init(); +} + +bool_coder_spec::bool_coder_spec( FILE *fp) { + fscanf( fp, "%d", &w); + int v; + fscanf( fp, "%d", &v); + assert( 0 <= v && v <= 2); + r = (Rounding) v; + fscanf( fp, "%d", &ebits); + fscanf( fp, "%d", &mbits); + if( float_init( ebits, mbits)) + return; + int i=0; do { + uint v; + fscanf( fp, "%d", &v); + assert( 0 <=v && v <= 65535U); + Ptbl[i] = v; + } while( ++i < 256); + cost_init(); +} + +void bool_coder_spec::dump( FILE *fp) const { + fprintf( fp, "%d %d %d %d\n", w, (int) r, ebits, mbits); + if( ebits || mbits) + return; + int i=0; do { fprintf( fp, "%d\n", Ptbl[i]);} while( ++i < 256); +} + +vp8bc_index_t bool_coder_spec::operator()( double p) const +{ + if( p <= 0.) + return 0; + if( p >= 1.) + return max_index; + if( ebias) { + if( p > .5) + return max_index - ( *this)( 1. - p); + int e; + uint m = (uint) ldexp( frexp( p, &e), mbits + 2); + uint x = 1 << (mbits + 1); + assert( x <= m && m < x<<1); + if( (m = (m >> 1) + (m & 1)) >= x) { + m = x >> 1; + ++e; + } + int y = 1 << ebits; + if( (e += y) >= y) + return half_index - 1; + if( e < 0) + return 0; + return (Index) ( (e << mbits) + (m & mmask)); + } + + cuint16 v = (uint16) (p * 65536.); + int i = 128; + int j = 128; + uint16 w; + while( w = Ptbl[i], j >>= 1) { + if( w < v) + i += j; + else if( w == v) + return (uchar) i; + else + i -= j; + } + if( w > v) { + cuint16 x = Ptbl[i-1]; + if( v <= x || w - v > v - x) + --i; + } else if( w < v && i < 255) { + cuint16 x = Ptbl[i+1]; + if( x <= v || x - v < v - w) + ++i; + } + return (Index) i; +} + +double bool_coder_spec::operator()( Index i) const { + if( !ebias) + return Ptbl[i]/65536.; + if( i >= half_index) + return 1. 
- ( *this)( (Index) (max_index - i)); + return ldexp( (double)mantissa( i), - (int) exponent( i)); +} + + + +void bool_writer::carry() { + uchar *p = B; + assert( p > Bstart); + while( *--p == 255) { assert( p > Bstart); *p = 0;} + ++*p; +} + + +bool_writer::bool_writer( c_spec& s, uchar *Dest, size_t Len) + : bool_coder( s), + Bstart( Dest), + Bend( Len? Dest+Len : 0), + B( Dest) +{ + assert( Dest); + reset(); +} + +bool_writer::~bool_writer() { flush();} + +#if 1 + extern "C" { int bc_v = 0;} +#else +# define bc_v 0 +#endif + + +void bool_writer::raw( bool value, uint32 s) { + uint32 L = Low; + + assert( Range >= min_range && Range <= spec.max_range()); + assert( !is_toast && s && s < Range); + + if( bc_v) printf( + "Writing a %d, B %x Low %x Range %x s %x blag %d ...\n", + value? 1:0, B-Bstart, Low, Range, s, bit_lag + ); + if( value) { + L += s; + s = Range - s; + } else + s -= rinc; + if( s < min_range) { + int ct = bit_lag; do { + if( !--ct) { + ct = 8; + if( L & (1 << 31)) + carry(); + assert( !Bend || B < Bend); + *B++ = (uchar) (L >> 23); + L &= (1<<23) - 1; + } + } while( L += L, (s += s + rinc) < min_range); + bit_lag = ct; + } + Low = L; + Range = s; + if( bc_v) + printf( + "...done, B %x Low %x Range %x blag %d \n", + B-Bstart, Low, Range, bit_lag + ); +} + +bool_writer& bool_writer::flush() { + if( is_toast) + return *this; + int b = bit_lag; + uint32 L = Low; + assert( b); + if( L & (1 << (32 - b))) + carry(); + L <<= b & 7; + b >>= 3; + while( --b >= 0) + L <<= 8; + b = 4; + assert( !Bend || B + 4 <= Bend); + do { + *B++ = (uchar) (L >> 24); + L <<= 8; + } while( --b); + is_toast = 1; + return *this; +} + + +bool_reader::bool_reader( c_spec& s, cuchar *src, size_t Len) + : bool_coder( s), + Bstart( src), + B( src), + Bend( Len? 
src+Len : 0), + shf( 32 - s.w), + bct( 8) +{ + int i = 4; do { Low <<= 8; Low |= *B++;} while( --i); +} + + +bool bool_reader::raw( uint32 s) { + + bool val = 0; + uint32 L = Low; + cuint32 S = s << shf; + + assert( Range >= min_range && Range <= spec.max_range()); + assert( s && s < Range && (L >> shf) < Range); + + if( bc_v) + printf( + "Reading, B %x Low %x Range %x s %x bct %d ...\n", + B-Bstart, Low, Range, s, bct + ); + + if( L >= S) { + L -= S; + s = Range - s; + assert( L < (s << shf)); + val = 1; + } else + s -= rinc; + if( s < min_range) { + int ct = bct; + do { + assert( ~L & (1 << 31)); + L += L; + if( !--ct) { + ct = 8; + if( !Bend || B < Bend) + L |= *B++; + } + } while( (s += s + rinc) < min_range); + bct = ct; + } + Low = L; + Range = s; + if( bc_v) + printf( + "...done, val %d B %x Low %x Range %x bct %d\n", + val? 1:0, B-Bstart, Low, Range, bct + ); + return val; +} + + +/* C interfaces */ + +// spec interface + +struct NS : bool_coder_namespace { + static Rounding r( vp8bc_c_prec *p, Rounding rr =down_full) { + return p? (Rounding) p->r : rr; + } +}; + +bool_coder_spec *vp8bc_vp6spec() { + return new bool_coder_spec_explicit_table( 0, bool_coder_namespace::Down, 8); +} +bool_coder_spec *vp8bc_float_spec( + unsigned int Ebits, unsigned int Mbits, vp8bc_c_prec *p +) { + return new bool_coder_spec_float( Ebits, Mbits, NS::r( p), p? p->prec : 12); +} +bool_coder_spec *vp8bc_literal_spec( + const unsigned short m[256], vp8bc_c_prec *p +) { + return new bool_coder_spec_explicit_table( m, NS::r( p), p? p->prec : 16); +} +bool_coder_spec *vp8bc_exponential_spec( unsigned int x, vp8bc_c_prec *p) +{ + return new bool_coder_spec_exponential_table( x, NS::r( p), p? 
p->prec : 16); +} +bool_coder_spec *vp8bc_spec_from_file( FILE *fp) { + return new bool_coder_spec( fp); +} +void vp8bc_destroy_spec( c_bool_coder_spec *p) { delete p;} + +void vp8bc_spec_to_file( c_bool_coder_spec *p, FILE *fp) { p->dump( fp);} + +vp8bc_index_t vp8bc_index( c_bool_coder_spec *p, double x) { + return ( *p)( x); +} + +vp8bc_index_t vp8bc_index_from_counts( + c_bool_coder_spec *p, unsigned int L, unsigned int R +) { + return ( *p)( (R += L)? (double) L/R : .5); +} + +double vp8bc_probability( c_bool_coder_spec *p, vp8bc_index_t i) { + return ( *p)( i); +} + +vp8bc_index_t vp8bc_complement( c_bool_coder_spec *p, vp8bc_index_t i) { + return p->complement( i); +} +unsigned int vp8bc_cost_zero( c_bool_coder_spec *p, vp8bc_index_t i) { + return p->cost_zero( i); +} +unsigned int vp8bc_cost_one( c_bool_coder_spec *p, vp8bc_index_t i) { + return p->cost_one( i); +} +unsigned int vp8bc_cost_bit( c_bool_coder_spec *p, vp8bc_index_t i, int v) { + return p->cost_bit( i, v); +} + +#if tim_vp8 + extern "C" int tok_verbose; + +# define dbg_l 1000000 + + static vp8bc_index_t dbg_i [dbg_l]; + static char dbg_v [dbg_l]; + static size_t dbg_w = 0, dbg_r = 0; +#endif + +// writer interface + +bool_writer *vp8bc_create_writer( + c_bool_coder_spec *p, unsigned char *D, size_t L +) { + return new bool_writer( *p, D, L); +} + +size_t vp8bc_destroy_writer( bool_writer *p) { + const size_t s = p->flush().bytes_written(); + delete p; + return s; +} + +void vp8bc_write_bool( bool_writer *p, int v, vp8bc_index_t i) +{ +# if tim_vp8 + // bc_v = dbg_w < 10; + if( bc_v = tok_verbose) + printf( " writing %d at prob %d\n", v? 1:0, i); + accum_entropy_bc( &p->Spec(), i, v); + + ( *p)( i, (bool) v); + + if( dbg_w < dbg_l) { + dbg_i [dbg_w] = i; + dbg_v [dbg_w++] = v? 
1:0; + } +# else + ( *p)( i, (bool) v); +# endif +} + +void vp8bc_write_bits( bool_writer *p, unsigned int v, int n) +{ +# if tim_vp8 + { + c_bool_coder_spec * const s = & p->Spec(); + const vp8bc_index_t i = s->half_index(); + int m = n; + while( --m >= 0) + accum_entropy_bc( s, i, (v>>m) & 1); + } +# endif + + p->write_bits( n, v); +} + +c_bool_coder_spec *vp8bc_writer_spec( c_bool_writer *w) { return & w->Spec();} + +// reader interface + +bool_reader *vp8bc_create_reader( + c_bool_coder_spec *p, const unsigned char *S, size_t L +) { + return new bool_reader( *p, S, L); +} + +void vp8bc_destroy_reader( bool_reader * p) { delete p;} + +int vp8bc_read_bool( bool_reader *p, vp8bc_index_t i) +{ +# if tim_vp8 + // bc_v = dbg_r < 10; + bc_v = tok_verbose; + const int v = ( *p)( i)? 1:0; + if( tok_verbose) + printf( " reading %d at prob %d\n", v, i); + if( dbg_r < dbg_l) { + assert( dbg_r <= dbg_w); + if( i != dbg_i[dbg_r] || v != dbg_v[dbg_r]) { + printf( + "Position %d: INCORRECTLY READING %d prob %d, wrote %d prob %d\n", + dbg_r, v, i, dbg_v[dbg_r], dbg_i[dbg_r] + ); + } + ++dbg_r; + } + return v; +# else + return ( *p)( i)? 1:0; +# endif +} + +unsigned int vp8bc_read_bits( bool_reader *p, int n) { return p->read_bits( n);} + +c_bool_coder_spec *vp8bc_reader_spec( c_bool_reader *r) { return & r->Spec();} + +#undef bc_v diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h new file mode 100644 index 000000000..5dfb212e1 --- /dev/null +++ b/vp8/common/x86/idct_x86.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef IDCT_X86_H +#define IDCT_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_idct(vp8_short_idct4x4llm_1_mmx); +extern prototype_idct(vp8_short_idct4x4llm_mmx); +extern prototype_idct_scalar(vp8_dc_only_idct_mmx); + +extern prototype_second_order(vp8_short_inv_walsh4x4_mmx); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_idct_idct1 +#define vp8_idct_idct1 vp8_short_idct4x4llm_1_mmx + +#undef vp8_idct_idct16 +#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx + +#undef vp8_idct_idct1_scalar +#define vp8_idct_idct1_scalar vp8_dc_only_idct_mmx + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx + +#endif +#endif + +#if HAVE_SSE2 + +extern prototype_second_order(vp8_short_inv_walsh4x4_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_sse2 + +#endif + +#endif + + + +#endif diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm new file mode 100644 index 000000000..2751c6934 --- /dev/null +++ b/vp8/common/x86/idctllm_mmx.asm @@ -0,0 +1,265 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +; /**************************************************************************** +; * Notes: +; * +; * This implementation makes use of 16 bit fixed point verio of two multiply +; * constants: +; * 1. sqrt(2) * cos (pi/8) +; * 2. 
sqrt(2) * sin (pi/8) +; * Becuase the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point prrcision as the second one, we use a trick of +; * x * a = x + x*(a-1) +; * so +; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). +; * +; * For the second constant, becuase of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it become a negative +; * number. +; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x +; * +; **************************************************************************/ + + +;void short_idct4x4llm_mmx(short *input, short *output, int pitch) +global sym(vp8_short_idct4x4llm_mmx) +sym(vp8_short_idct4x4llm_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + ; end prolog + + mov rax, arg(0) ;input + mov rdx, arg(1) ;output + + movq mm0, [rax ] + movq mm1, [rax+ 8] + + movq mm2, [rax+16] + movq mm3, [rax+24] + + movsxd rax, dword ptr arg(2) ;pitch + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [x_s1sqr2 GLOBAL] ; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [x_c1sqr2less1 GLOBAL] ; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [x_c1sqr2less1 GLOBAL] + paddw mm5, mm1 + + pmulhw mm3, [x_s1sqr2 GLOBAL] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, 
mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [x_s1sqr2 GLOBAL] ; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [x_c1sqr2less1 GLOBAL] ; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [x_c1sqr2less1 GLOBAL] + paddw mm5, mm1 + + pmulhw mm3, [x_s1sqr2 GLOBAL] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [fours GLOBAL] + + paddw mm2, [fours GLOBAL] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq [rdx], mm0 + + movq [rdx+rax], mm1 + movq [rdx+rax*2], mm2 + + add rdx, rax + movq [rdx+rax*2], mm5 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) +global sym(vp8_short_idct4x4llm_1_mmx) +sym(vp8_short_idct4x4llm_1_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + ; end prolog + + mov rax, arg(0) ;input + movd mm0, [rax] + + paddw mm0, [fours GLOBAL] + mov rdx, arg(1) ;output + + psraw mm0, 3 + movsxd rax, dword ptr arg(2) ;pitch + + punpcklwd mm0, mm0 + punpckldq mm0, mm0 + + movq [rdx], mm0 + movq [rdx+rax], mm0 + + movq [rdx+rax*2], mm0 + add rdx, rax + + movq [rdx+rax*2], mm0 + + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void dc_only_idct_mmx(short input_dc, short *output, int pitch) 
+global sym(vp8_dc_only_idct_mmx) +sym(vp8_dc_only_idct_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + ; end prolog + + movd mm0, arg(0) ;input_dc + + paddw mm0, [fours GLOBAL] + mov rdx, arg(1) ;output + + psraw mm0, 3 + movsxd rax, dword ptr arg(2) ;pitch + + punpcklwd mm0, mm0 + punpckldq mm0, mm0 + + movq [rdx], mm0 + movq [rdx+rax], mm0 + + movq [rdx+rax*2], mm0 + add rdx, rax + + movq [rdx+rax*2], mm0 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm new file mode 100644 index 000000000..562e5908f --- /dev/null +++ b/vp8/common/x86/iwalsh_mmx.asm @@ -0,0 +1,172 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_1_mmx) +sym(vp8_short_inv_walsh4x4_1_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rax, 3 + + mov rdi, arg(1) + add rax, [rsi] ;input[0] + 3 + + movd mm0, eax + + punpcklwd mm0, mm0 ;x x val val + + punpckldq mm0, mm0 ;val val val val + + psraw mm0, 3 ;(input[0] + 3) >> 3 + + movq [rdi + 0], mm0 + movq [rdi + 8], mm0 + movq [rdi + 16], mm0 + movq [rdi + 24], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_mmx) +sym(vp8_short_inv_walsh4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rax, 3 + mov rsi, arg(0) + mov rdi, arg(1) + shl rax, 16 + + movq mm0, [rsi + 0] ;ip[0] + movq mm1, [rsi + 8] ;ip[4] + or rax, 3 ;00030003h + + movq mm2, [rsi + 16] ;ip[8] + movq mm3, [rsi + 24] ;ip[12] + + movd mm7, rax + movq mm4, mm0 + + punpcklwd mm7, mm7 ;0003000300030003h + movq mm5, mm1 + + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm4 ;temp al + + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl + + ; 03 02 01 00 + ; 13 12 11 10 + ; 23 22 21 20 + ; 33 32 31 30 + + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 + + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq 
mm3, mm1 ; 33 23 13 03 aka ip[12] +;~~~~~~~~~~~~~~~~~~~~~ + movq mm1, mm0 + movq mm5, mm4 + + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl +;~~~~~~~~~~~~~~~~~~~~~ + movq mm3, mm1 ; 03 02 01 00 + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm4, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm4, mm5 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] + + paddw mm0, mm7 + paddw mm1, mm7 + paddw mm2, mm7 + paddw mm3, mm7 + + psraw mm0, 3 + psraw mm1, 3 + psraw mm2, 3 + psraw mm3, 3 + + movq [rdi + 0], mm0 + movq [rdi + 8], mm1 + movq [rdi + 16], mm2 + movq [rdi + 24], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm new file mode 100644 index 000000000..96943dfb8 --- /dev/null +++ b/vp8/common/x86/iwalsh_sse2.asm @@ -0,0 +1,116 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_sse2) +sym(vp8_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + mov rax, 3 + + movdqa xmm0, [rsi + 0] ;ip[4] ip[0] + movdqa xmm1, [rsi + 16] ;ip[12] ip[8] + + shl rax, 16 + or rax, 3 ;00030003h + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + movd xmm7, eax + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + +;;;temp output +;; movdqu [rdi + 0], xmm4 +;; movdqu [rdi + 16], xmm3 + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm7, xmm7, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 + 
punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + paddw xmm5, xmm7 + paddw xmm1, xmm7 + + psraw xmm5, 3 + psraw xmm1, 3 + + movdqa [rdi + 0], xmm5 + movdqa [rdi + 16], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm new file mode 100644 index 000000000..6e4d2b651 --- /dev/null +++ b/vp8/common/x86/loopfilter_mmx.asm @@ -0,0 +1,1776 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_loop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_mmx) +sym(vp8_loop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
+ + movsxd rcx, dword ptr arg(5) ;count +next8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 ; + + + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + + psubusb mm4, mm7 + por mm1, mm4 + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, [rsi] ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit ; get flimit + movq mm2, [rdx] ; flimit mm2 + paddb mm2, mm2 ; flimit*2 (less than 255) + paddb mm7, mm2 ; flimit * 2 + limit (less 
than 255) + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + pxor mm6, [t80 GLOBAL] ; offset to convert to signed values + pxor mm0, [t80 GLOBAL] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + movq mm2, mm1 + paddsb mm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb mm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + pxor mm0, mm0 ; + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + psraw mm5, 11 + packsswb mm0, mm5 + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [ones GLOBAL] + paddsw mm1, [ones GLOBAL] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - 
p0) + hvm(p1 - q1) + 4) >>4 + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [t80 GLOBAL] ; unoffset + movq [rsi+rax], mm6 ; write back + + movq mm6, [rsi+2*rax] ; p1 + pxor mm6, [t80 GLOBAL] ; reoffset + paddsb mm6, mm4 ; p1+= p1 add + pxor mm6, [t80 GLOBAL] ; unoffset + movq [rsi+2*rax], mm6 ; write back + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [t80 GLOBAL] ; unoffset + movq [rsi], mm3 ; write back + + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [t80 GLOBAL] ; unoffset + movq [rdi], mm7 ; write back + + add rsi,8 + neg rax + dec rcx + jnz next8_h + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_vertical_edge_mmx) +sym(vp8_loop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 64 ; reserve 64 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
+ + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +next8_v: + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + + ;transpose + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + + punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 + punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + psubusb mm5, mm7 ; q2-q3 + + psubusb mm7, mm6 ; q3-q2 + por mm7, mm5; ; mm7=abs (q3-q2) + + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + + psubusb mm3, mm6 ; q1-q2 + psubusb mm6, mm5 ; q2-q1 + + por mm6, mm3 ; mm6=abs(q2-q1) + lea rdx, srct + + movq [rdx+24], mm5 ; save q1 + movq [rdx+16], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 
05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+8], mm3 ; save p0 + + movq [rdx], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 + + psubusb mm0, mm4 + psubusb mm1, mm4 + + psubusb mm6, mm4 + por mm7, mm6 + + por mm0, mm1 + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+16] ; mm5=q0 + movq mm7, [rdx+24] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit ; + + movq mm2, [rdx] ;flimit mm2 + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; 
abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + paddb mm2, mm2 ; flimit*2 (less than 255) + paddb mm4, mm2 ; flimit * 2 + limit (less than 255) + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + ; start work on filters + lea rdx, srct + + movq mm2, [rdx] ; p1 + movq mm7, [rdx+24] ; q1 + + movq mm6, [rdx+8] ; p0 + movq mm0, [rdx+16] ; q0 + + pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values + + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + + pxor mm6, [t80 GLOBAL] ; offset to convert to signed values + pxor mm0, [t80 GLOBAL] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + movq mm2, mm1 + paddsb mm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + + paddsb mm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + pxor mm0, mm0 ; + + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + + psraw mm5, 11 + packsswb mm0, mm5 + + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, 
mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [ones GLOBAL] + + paddsw mm1, [ones GLOBAL] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [t80 GLOBAL] ; unoffset + + ; mm6=p0 ; + movq mm1, [rdx] ; p1 + pxor mm1, [t80 GLOBAL] ; reoffset + + paddsb mm1, mm4 ; p1+= p1 add + pxor mm1, [t80 GLOBAL] ; unoffset + ; mm6 = p0 mm1 = p1 + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [t80 GLOBAL] ; unoffset + + ; mm3 = q0 + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [t80 GLOBAL] ; unoffset + ; mm7 = q1 + + ; tranpose and write back + ; mm1 = 72 62 52 42 32 22 12 02 + ; mm6 = 73 63 53 43 33 23 13 03 + ; mm3 = 74 64 54 44 34 24 14 04 + ; mm7 = 75 65 55 45 35 25 15 05 + + movq mm2, mm1 ; 72 62 52 42 32 22 12 02 + punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 + + movq mm4, mm3 ; 74 64 54 44 34 24 14 04 + punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 + + punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 + punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 + + movq mm6, mm2 ; 33 32 23 22 13 12 03 02 + punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 + + punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 + movq mm5, mm1 ; 73 72 63 62 53 52 43 42 + + punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 + punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 + + + ; mm2 = 15 14 13 12 05 04 03 02 + ; mm6 = 35 34 33 32 25 24 23 22 + ; mm5 = 55 54 53 52 45 44 43 42 + ; mm1 = 75 74 73 72 65 64 63 62 + + + + movd [rsi+rax*4+2], mm2 + psrlq mm2, 32 + + movd [rdi+rax*4+2], mm2 + movd [rsi+rax*2+2], mm6 + + psrlq mm6, 32 + movd [rsi+rax+2],mm6 + + movd [rsi+2], mm1 + psrlq mm1, 32 + + movd [rdi+2], mm1 + neg rax + + movd [rdi+rax+2],mm5 + psrlq mm5, 32 + + movd [rdi+rax*2+2], mm5 + + lea rsi, [rsi+rax*8] + dec rcx + jnz next8_v + + add rsp, 64 + pop rsp + ; begin 
epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_horizontal_edge_mmx) +sym(vp8_mbloop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +next8_mbh: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 + + + ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm3=q1, mm7 = limit + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, 
mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm5 = p0 + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, mm0 ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit ; get flimit + movq mm2, [rdx] ; flimit mm2 + paddb mm2, mm2 ; flimit*2 (less than 255) + paddb mm7, mm2 ; flimit * 2 + limit (less than 255) + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + + ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, mm4=hev + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [t80 GLOBAL] ; offset to convert to 
signed values + pxor mm0, [t80 GLOBAL] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + + ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [t3 GLOBAL]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [s27 GLOBAL] + pmulhw mm2, [s27 GLOBAL] + paddw mm1, [s63 GLOBAL] + paddw mm2, [s63 GLOBAL] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [t80 GLOBAL] + pxor mm6, [t80 GLOBAL] + movq [rsi+rax], mm6 + movq [rsi], mm3 + + ; roughly 2/7th difference across boundary + ; u = 
vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [s18 GLOBAL] + pmulhw mm2, [s18 GLOBAL] + paddw mm1, [s63 GLOBAL] + paddw mm2, [s63 GLOBAL] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdi] + movq mm6, [rsi+rax*2] ; p1 + + pxor mm3, [t80 GLOBAL] + pxor mm6, [t80 GLOBAL] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [t80 GLOBAL] + pxor mm3, [t80 GLOBAL] + movq [rdi], mm3 + movq [rsi+rax*2], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [s9 GLOBAL] + pmulhw mm2, [s9 GLOBAL] + paddw mm1, [s63 GLOBAL] + paddw mm2, [s63 GLOBAL] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + + movq mm6, [rdi+rax*4] + neg rax + movq mm3, [rdi+rax ] + + pxor mm6, [t80 GLOBAL] + pxor mm3, [t80 GLOBAL] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [t80 GLOBAL] + pxor mm3, [t80 GLOBAL] + movq [rdi+rax ], mm3 + neg rax + movq [rdi+rax*4], mm6 + +;EARLY_BREAK_OUT: + neg rax + add rsi,8 + dec rcx + jnz next8_mbh + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_vertical_edge_mmx) +sym(vp8_mbloop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) 
char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +next8_mbv: + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ;transpose + movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 + + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + + movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 + + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + lea rdx, srct + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + + movq [rdx+56], mm7 + psubusb mm5, mm7 ; q2-q3 + + + movq [rdx+48], mm6 + psubusb mm7, mm6 ; q3-q2 + + por mm7, mm5; ; mm7=abs (q3-q2) + 
movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + psubusb mm3, mm6 ; q1-q2 + + psubusb mm6, mm5 ; q2-q1 + por mm6, mm3 ; mm6=abs(q2-q1) + + movq [rdx+40], mm5 ; save q1 + movq [rdx+32], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq [rdx], mm0 ; save p3 + movq [rdx+8], mm1 ; save p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+24], mm3 ; save p0 + + movq [rdx+16], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 ; abs(q3-q2) > limit + + psubusb mm0, mm4 ; abs(p3-p2) > limit + psubusb mm1, mm4 ; abs(p2-p1) > limit + + psubusb mm6, mm4 ; abs(q2-q1) > limit + por mm7, mm6 ; or + + por mm0, mm1 ; + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+32] ; mm5=q0 + movq mm7, [rdx+40] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, 
mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit ; + + movq mm2, [rdx] ;flimit mm2 + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + paddb mm2, mm2 ; flimit*2 (less than 255) + paddb mm4, mm2 ; flimit * 2 + limit (less than 255) + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 ; abs(q1 - q0) > thresh + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 ; abs(p1 - p0)> thresh + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + + ; start work on filters + lea rdx, srct + + ; start work on filters + movq mm2, [rdx+16] ; p1 + movq mm7, [rdx+40] ; q1 + pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + movq mm6, [rdx+24] ; p0 + movq mm0, [rdx+32] ; q0 + pxor mm6, [t80 GLOBAL] ; offset to convert to signed values + pxor mm0, [t80 GLOBAL] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + ; 
mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [t3 GLOBAL]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [s27 GLOBAL] + pmulhw mm2, [s27 GLOBAL] + paddw mm1, [s63 GLOBAL] + paddw mm2, [s63 GLOBAL] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [t80 GLOBAL] + pxor mm6, [t80 GLOBAL] + movq [rdx+24], mm6 + movq [rdx+32], mm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [s18 GLOBAL] + pmulhw mm2, [s18 GLOBAL] + paddw mm1, [s63 GLOBAL] + paddw 
mm2, [s63 GLOBAL] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdx + 40] + movq mm6, [rdx + 16] ; p1 + pxor mm3, [t80 GLOBAL] + pxor mm6, [t80 GLOBAL] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [t80 GLOBAL] + pxor mm3, [t80 GLOBAL] + movq [rdx + 40], mm3 + movq [rdx + 16], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [s9 GLOBAL] + pmulhw mm2, [s9 GLOBAL] + paddw mm1, [s63 GLOBAL] + paddw mm2, [s63 GLOBAL] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm6, [rdx+ 8] + movq mm3, [rdx+48] + + pxor mm6, [t80 GLOBAL] + pxor mm3, [t80 GLOBAL] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [t80 GLOBAL] ; mm6 = 71 61 51 41 31 21 11 01 + pxor mm3, [t80 GLOBAL] ; mm3 = 76 66 56 46 36 26 15 06 + + ; tranpose and write back + movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 + movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 + + punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 + punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 + + movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 + movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 + + punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 + punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 + + movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 + punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 + + punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 + movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 + + punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 + punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 + + movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 + punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 + + movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 + punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 + + 
movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 + punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 + + punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 + movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 + + punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 + punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 + + movq [rsi+rax*4], mm0 ; write out + movq [rdi+rax*4], mm6 ; write out + + movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 + punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 + + punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 + movq [rsi+rax*2], mm0 ; write out + + movq [rdi+rax*2], mm5 ; write out + movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 + + punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 + punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 + + movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 + punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 + + punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 + movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 + + movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 + punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 + + punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 + movq [rsi], mm0 ; write out + + movq [rdi], mm1 ; write out + neg rax + + punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 + punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 + + movq [rsi+rax*2], mm3 + movq [rdi+rax*2], mm4 + + lea rsi, [rsi+rax*8] + dec rcx + + jnz next8_mbv + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_simple_horizontal_edge_mmx) +sym(vp8_loop_filter_simple_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr 
arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +nexts8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdx, arg(2) ;flimit ; get flimit + movq mm3, [rdx] ; + paddb mm3, mm3 ; flimit*2 (less than 255) + paddb mm3, mm7 ; flimit * 2 + limit (less than 255) + + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + neg rax + + ; calculate mask + movq mm1, [rsi+2*rax] ; p1 + movq mm0, [rdi] ; q1 + movq mm2, mm1 + movq mm7, mm0 + movq mm4, mm0 + psubusb mm0, mm1 ; q1-=p1 + psubusb mm1, mm4 ; p1-=q1 + por mm1, mm0 ; abs(p1-q1) + pand mm1, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw mm1, 1 ; abs(p1-q1)/2 + + movq mm5, [rsi+rax] ; p0 + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + movq mm6, mm5 ; p0 + psubusb mm5, mm4 ; p0-=q0 + psubusb mm4, mm6 ; q0-=p0 + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + pxor mm3, mm3 + pcmpeqb mm5, mm3 + + ; start work on filters + pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [t80 GLOBAL] ; offset to convert to signed values + pxor mm0, [t80 GLOBAL] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) + pand mm5, mm2 ; mask filter values we don't care about + + ; do + 4 side + paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + movq mm1, mm5 ; get a copy of filters + psraw mm1, 11 ; arithmetic shift right 11 + psllw mm1, 8 ; shift left 8 to put it back + + por mm0, mm1 ; put the two together to get result + + 
psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [t80 GLOBAL] ; unoffset + movq [rsi], mm3 ; write back + + + ; now do +3 side + psubsb mm5, [t1s GLOBAL] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [t80 GLOBAL] ; unoffset + movq [rsi+rax], mm6 ; write back + + add rsi,8 + neg rax + dec rcx + jnz nexts8_h + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_simple_vertical_edge_mmx) +sym(vp8_loop_filter_simple_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
+ + lea rsi, [rsi + rax*4- 2]; ; + movsxd rcx, dword ptr arg(5) ;count +nexts8_v: + + lea rdi, [rsi + rax]; + movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 + + movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + + movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 + movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 + + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + movq mm5, mm4 ; 53 43 52 42 51 41 50 40 + + punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 + punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 + + neg rax + + movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 + movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 + + punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 + movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 + + movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 + punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 + + movq mm2, mm0 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 13 03 12 02 11 01 10 00 + + punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 + movq mm3, mm2 ; 33 23 13 03 32 22 12 02 + + punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 + punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 + + punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 + + + ; calculate mask + movq mm6, mm0 ; p1 + movq mm7, mm3 ; q1 + psubusb mm7, mm6 ; q1-=p1 + psubusb mm6, mm3 ; p1-=q1 + por mm6, mm7 ; abs(p1-q1) + pand mm6, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw mm6, 1 ; abs(p1-q1)/2 + + movq mm5, mm1 ; p0 + movq mm4, mm2 ; q0 + + psubusb mm5, mm2 ; p0-=q0 + psubusb mm4, mm1 ; q0-=p0 + + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit ; get flimit + movq mm7, [rdx] + mov rdx, arg(3) ; get limit + movq mm6, [rdx] + paddb mm7, mm7 ; flimit*2 (less than 255) + paddb mm7, mm6 ; flimit * 2 + limit (less than 255) + + psubusb mm5, mm7 ; 
abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + pxor mm7, mm7 + pcmpeqb mm5, mm7 ; mm5 = mask + + ; start work on filters + movq t0, mm0 + movq t1, mm3 + + pxor mm0, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor mm3, [t80 GLOBAL] ; q1 offset to convert to signed values + + psubsb mm0, mm3 ; p1 - q1 + movq mm6, mm1 ; p0 + + movq mm7, mm2 ; q0 + pxor mm6, [t80 GLOBAL] ; offset to convert to signed values + + pxor mm7, [t80 GLOBAL] ; offset to convert to signed values + movq mm3, mm7 ; offseted ; q0 + + psubsb mm7, mm6 ; q0 - p0 + paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) + + paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) + + pand mm5, mm0 ; mask filter values we don't care about + + paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + movq mm7, mm5 ; get a copy of filters + psraw mm7, 11 ; arithmetic shift right 11 + psllw mm7, 8 ; shift left 8 to put it back + + por mm0, mm7 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0sz add + pxor mm3, [t80 GLOBAL] ; unoffset + + ; now do +3 side + psubsb mm5, [t1s GLOBAL] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [t80 GLOBAL] ; unoffset + + + movq mm0, t0 + movq mm4, t1 + + ; mm0 = 70 60 50 40 30 20 10 00 + ; mm6 = 71 61 51 41 31 21 11 01 + ; mm3 = 72 62 52 42 32 22 12 02 + ; mm4 = 73 63 53 43 33 23 13 03 + ; transpose back to write out + + movq mm1, mm0 ; + punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 + + punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 + movq mm2, mm3 ; + + punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 + movq mm5, 
mm1 ; 71 70 61 60 51 50 41 40 + + punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 + movq mm6, mm0 ; 31 30 21 20 11 10 01 00 + + punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 + punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 + + movd [rsi+rax*4], mm0 ; write 03 02 01 00 + punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 + + psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 + punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 + + movd [rdi+rax*4], mm0 ; write 13 12 11 10 + movd [rsi+rax*2], mm6 ; write 23 22 21 20 + + psrlq mm6, 32 ; 33 32 31 30 + movd [rsi], mm1 ; write 43 42 41 40 + + movd [rsi + rax], mm6 ; write 33 32 31 30 + neg rax + + movd [rsi + rax*2], mm5 ; write 63 62 61 60 + psrlq mm1, 32 ; 53 52 51 50 + + movd [rdi], mm1 ; write out 53 52 51 50 + psrlq mm5, 32 ; 73 72 71 70 + + movd [rdi + rax*2], mm5 ; write 73 72 71 70 + + lea rsi, [rsi+rax*8] ; next 8 + + dec rcx + jnz nexts8_v + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, +; int y_stride, +; loop_filter_info *lfi) +;{ +; +; +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +;} + +SECTION_RODATA +align 16 +tfe: + times 8 db 0xfe +align 16 +t80: + times 8 db 0x80 +align 16 +t1s: + times 8 db 0x01 +align 16 +t3: + times 8 db 0x03 +align 16 +t4: + times 8 db 0x04 +align 16 +ones: + times 4 dw 0x0001 +align 16 +s27: + times 4 dw 0x1b00 +align 16 +s18: + times 4 dw 0x1200 +align 16 +s9: + times 4 dw 0x0900 +align 16 +s63: + times 4 dw 0x003f diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm new file mode 100644 index 000000000..5275dfa3b --- /dev/null +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -0,0 +1,1978 @@ +; +; Copyright (c) 2010 
The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
; VP8 "normal" loop filter across a horizontal edge, 16 pixels at a time
; (unaligned movdqu loads). Builds the filter mask from the p3..q3
; neighbourhood, computes the high-edge-variance (hev) flag from thresh,
; then adjusts p1, p0, q0 and q1 in place.
; NOTE(review): count (arg 5) is not read in this view — the 16-wide core
; appears to cover the whole edge in one pass; confirm against callers.
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 32                     ; reserve 32 bytes of aligned scratch
    %define t0 [rsp + 0]            ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]           ;__declspec(align(16)) char t1[16];

    mov rsi, arg(0)                 ;src_ptr
    movsxd rax, dword ptr arg(1)    ;src_pixel_step ; destination pitch?

    mov rdx, arg(3)                 ;limit
    movdqa xmm7, XMMWORD PTR [rdx]
    mov rdi, rsi                    ; rdi points to row +1 for indirect addressing
    add rdi, rax

    ; calculate breakout conditions: every |neighbour delta| must be <= limit
    movdqu xmm2, [rdi+2*rax]        ; q3
    movdqu xmm1, [rsi+2*rax]        ; q2
    movdqa xmm6, xmm1               ; q2
    psubusb xmm1, xmm2              ; q2-=q3
    psubusb xmm2, xmm6              ; q3-=q2
    por xmm1, xmm2                  ; abs(q3-q2)
    psubusb xmm1, xmm7              ; nonzero where abs(q3-q2) > limit


    movdqu xmm4, [rsi+rax]          ; q1
    movdqa xmm3, xmm4               ; q1
    psubusb xmm4, xmm6              ; q1-=q2
    psubusb xmm6, xmm3              ; q2-=q1
    por xmm4, xmm6                  ; abs(q2-q1)

    psubusb xmm4, xmm7
    por xmm1, xmm4

    movdqu xmm4, [rsi]              ; q0
    movdqa xmm0, xmm4               ; q0
    psubusb xmm4, xmm3              ; q0-=q1
    psubusb xmm3, xmm0              ; q1-=q0
    por xmm4, xmm3                  ; abs(q0-q1)
    movdqa t0, xmm4                 ; save to t0 (reused for hev test)
    psubusb xmm4, xmm7
    por xmm1, xmm4

    neg rax                         ; negate pitch to deal with above border
    movdqu xmm2, [rsi+4*rax]        ; p3
    movdqu xmm4, [rdi+4*rax]        ; p2
    movdqa xmm5, xmm4               ; p2
    psubusb xmm4, xmm2              ; p2-=p3
    psubusb xmm2, xmm5              ; p3-=p2
    por xmm4, xmm2                  ; abs(p3 - p2)
    psubusb xmm4, xmm7
    por xmm1, xmm4


    movdqu xmm4, [rsi+2*rax]        ; p1
    movdqa xmm3, xmm4               ; p1
    psubusb xmm4, xmm5              ; p1-=p2
    psubusb xmm5, xmm3              ; p2-=p1
    por xmm4, xmm5                  ; abs(p2 - p1)
    psubusb xmm4, xmm7
    por xmm1, xmm4

    movdqa xmm2, xmm3               ; p1

    movdqu xmm4, [rsi+rax]          ; p0
    movdqa xmm5, xmm4               ; p0
    psubusb xmm4, xmm3              ; p0-=p1
    psubusb xmm3, xmm5              ; p1-=p0
    por xmm4, xmm3                  ; abs(p1 - p0)
    movdqa t1, xmm4                 ; save to t1 (reused for hev test)
    psubusb xmm4, xmm7
    por xmm1, xmm4

    movdqu xmm3, [rdi]              ; q1
    movdqa xmm4, xmm3               ; q1
    psubusb xmm3, xmm2              ; q1-=p1
    psubusb xmm2, xmm4              ; p1-=q1
    por xmm2, xmm3                  ; abs(p1-q1)
    pand xmm2, [tfe GLOBAL]         ; set lsb of each byte to zero
    psrlw xmm2, 1                   ; abs(p1-q1)/2

    movdqa xmm6, xmm5               ; p0
    movdqu xmm3, [rsi]              ; q0
    psubusb xmm5, xmm3              ; p0-=q0
    psubusb xmm3, xmm6              ; q0-=p0
    por xmm5, xmm3                  ; abs(p0 - q0)
    paddusb xmm5, xmm5              ; abs(p0-q0)*2
    paddusb xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov rdx, arg(2)                 ;flimit ; get flimit
    movdqa xmm2, [rdx]

    paddb xmm2, xmm2                ; flimit*2 (less than 255)
    paddb xmm7, xmm2                ; flimit * 2 + limit (less than 255)

    psubusb xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    por xmm1, xmm5
    pxor xmm5, xmm5
    pcmpeqb xmm1, xmm5              ; mask mm1 (0xff where the edge should be filtered)


    ; calculate high edge variance
    mov rdx, arg(4)                 ;thresh ; get thresh
    movdqa xmm7, [rdx]
    movdqa xmm4, t0                 ; get abs (q1 - q0)
    psubusb xmm4, xmm7
    movdqa xmm3, t1                 ; get abs (p1 - p0)
    psubusb xmm3, xmm7
    paddb xmm4, xmm3                ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    pcmpeqb xmm4, xmm5
    pcmpeqb xmm5, xmm5
    pxor xmm4, xmm5                 ; xmm4 = hev mask


    ; start work on filters
    movdqu xmm2, [rsi+2*rax]        ; p1
    movdqu xmm7, [rdi]              ; q1
    pxor xmm2, [t80 GLOBAL]         ; p1 offset to convert to signed values
    pxor xmm7, [t80 GLOBAL]         ; q1 offset to convert to signed values
    psubsb xmm2, xmm7               ; p1 - q1
    pand xmm2, xmm4                 ; high var mask (hvm)(p1 - q1)
    pxor xmm6, [t80 GLOBAL]         ; offset to convert to signed values
    pxor xmm0, [t80 GLOBAL]         ; offset to convert to signed values
    movdqa xmm3, xmm0               ; q0
    psubsb xmm0, xmm6               ; q0 - p0
    paddsb xmm2, xmm0               ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb xmm2, xmm0               ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb xmm2, xmm0               ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand xmm1, xmm2                 ; mask filter values we don't care about
    movdqa xmm2, xmm1
    paddsb xmm1, [t4 GLOBAL]        ; 3* (q0 - p0) + hvm(p1 - q1) + 4
    paddsb xmm2, [t3 GLOBAL]        ; 3* (q0 - p0) + hvm(p1 - q1) + 3

    ; signed >>3 of packed bytes via widen-to-words / psraw 11 / repack
    pxor xmm0, xmm0
    pxor xmm5, xmm5
    punpcklbw xmm0, xmm2
    punpckhbw xmm5, xmm2
    psraw xmm0, 11
    psraw xmm5, 11
    packsswb xmm0, xmm5
    movdqa xmm2, xmm0               ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor xmm0, xmm0                 ; 0
    movdqa xmm5, xmm1               ; abcdefgh
    punpcklbw xmm0, xmm1            ; e0f0g0h0
    psraw xmm0, 11                  ; sign extended shift right by 3
    pxor xmm1, xmm1                 ; 0
    punpckhbw xmm1, xmm5            ; a0b0c0d0
    psraw xmm1, 11                  ; sign extended shift right by 3
    movdqa xmm5, xmm0               ; save results

    packsswb xmm0, xmm1             ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw xmm5, [ones GLOBAL]
    paddsw xmm1, [ones GLOBAL]
    psraw xmm5, 1                   ; partial shifted one more time for 2nd tap
    psraw xmm1, 1                   ; partial shifted one more time for 2nd tap
    packsswb xmm5, xmm1             ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
    pandn xmm4, xmm5                ; high edge variance additive

    paddsb xmm6, xmm2               ; p0+= p0 add
    pxor xmm6, [t80 GLOBAL]         ; unoffset
    movdqu [rsi+rax], xmm6          ; write back

    movdqu xmm6, [rsi+2*rax]        ; p1
    pxor xmm6, [t80 GLOBAL]         ; reoffset
    paddsb xmm6, xmm4               ; p1+= p1 add
    pxor xmm6, [t80 GLOBAL]         ; unoffset
    movdqu [rsi+2*rax], xmm6        ; write back

    psubsb xmm3, xmm0               ; q0-= q0 add
    pxor xmm3, [t80 GLOBAL]         ; unoffset
    movdqu [rsi], xmm3              ; write back

    psubsb xmm7, xmm4               ; q1-= q1 add
    pxor xmm7, [t80 GLOBAL]         ; unoffset
    movdqu [rdi], xmm7              ; write back

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
; Vertical-edge variant: transposes a 16-row x 8-column block into xmm
; registers (spilling half to the srct scratch area), runs the same
; normal-filter core, then transposes the 4 changed columns back.
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 96                     ; reserve 96 bytes of aligned scratch
    %define t0 [rsp + 0]            ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]           ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]         ;__declspec(align(16)) char srct[64];

    mov rsi, arg(0)                 ;src_ptr
    movsxd rax, dword ptr arg(1)    ;src_pixel_step ; destination pitch?
    lea rsi, [rsi + rax*4 - 4]      ; start 4 rows down, 4 columns left of the edge
    mov rdi, rsi                    ; rdi points to row +1 for indirect addressing

    add rdi, rax
    lea rcx, [rdi + rax *8]         ; rcx = row +9, anchors the lower 8 rows

    ;transpose 16 rows x 8 columns into p3..q3 register rows
    movq xmm7, QWORD PTR [rsi+2*rax]        ; 67 66 65 64 63 62 61 60
    movq xmm6, QWORD PTR [rdi+2*rax]        ; 77 76 75 74 73 72 71 70

    punpcklbw xmm7, xmm6            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
    movq xmm5, QWORD PTR [rsi]              ; 47 46 45 44 43 42 41 40

    movq xmm4, QWORD PTR [rsi+rax]          ; 57 56 55 54 53 52 51 50
    punpcklbw xmm5, xmm4            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40

    movdqa xmm3, xmm5               ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
    punpckhwd xmm5, xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44

    lea rsi, [rsi+ rax*8]

    punpcklwd xmm3, xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
    movq xmm6, QWORD PTR [rsi + 2*rax]      ; e7 e6 e5 e4 e3 e2 e1 e0

    movq xmm7, QWORD PTR [rcx + 2*rax]      ; f7 f6 f5 f4 f3 f2 f1 f0
    punpcklbw xmm6, xmm7            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

    movq xmm4, QWORD PTR [rsi]              ; c7 c6 c5 c4 c3 c2 c1 c0
    movq xmm7, QWORD PTR [rsi + rax]        ; d7 d6 d5 d4 d3 d2 d1 d0

    punpcklbw xmm4, xmm7            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
    movdqa xmm7, xmm4               ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

    punpckhwd xmm7, xmm6            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
    punpcklwd xmm4, xmm6            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

    ; xmm3 xmm4, xmm5 xmm7 in use
    neg rax

    lea rsi, [rsi+rax*8]
    movq xmm6, QWORD PTR [rsi+rax*2]        ; 27 26 25 24 23 22 21 20

    movq xmm1, QWORD PTR [rsi+rax ]         ; 37 36 35 34 33 32 31 30
    punpcklbw xmm6, xmm1            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

    movq xmm2, QWORD PTR [rsi+rax*4]        ; 07 06 05 04 03 02 01 00
    movq xmm1, QWORD PTR [rdi+rax*4]        ; 17 16 15 14 13 12 11 10

    punpcklbw xmm2, xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
    movdqa xmm0, xmm2

    punpckhwd xmm2, xmm6            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
    punpcklwd xmm0, xmm6            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

    movdqa xmm6, xmm2
    punpckldq xmm2, xmm5            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

    punpckhdq xmm6, xmm5            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
    ;xmm0 xmm2 xmm3 xmm4, xmm6, xmm7

    movdqa xmm5, xmm0               ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    punpckhdq xmm5, xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

    punpckldq xmm0, xmm3            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    lea rsi, [rcx+rax]
    ; xmm1, xmm3 free
    movq xmm1, QWORD PTR [rsi+rax*2]        ; a7 a6 a5 a4 a3 a2 a1 a0
    movq xmm3, QWORD PTR [rsi+rax]          ; b7 b6 b5 b4 b3 b2 b1 b0

    punpcklbw xmm1, xmm3
    lea rdx, srct

    movdqa [rdx+16], xmm1           ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
    movq xmm3, QWORD PTR [rsi+rax*4]        ; 87 86 85 84 83 82 81 80

    movq xmm1, QWORD PTR [rcx+rax*4]
    punpcklbw xmm3, xmm1            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

    movdqa [rdx], xmm3              ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

    punpckhwd xmm3, [rdx+16]        ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
    movdqa xmm1, xmm3               ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

    punpckhdq xmm1, xmm7            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
    punpckldq xmm3, xmm7            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

    movdqa xmm7, xmm2               ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
    punpcklqdq xmm7, xmm3           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

    punpckhqdq xmm2, xmm3           ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
    movdqa [rdx+32], xmm7           ; save 4s (q0 column)

    movdqa [rdx+48], xmm2           ; save 5s (q1 column)
    movdqa xmm7, xmm6               ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

    punpckhqdq xmm7, xmm1           ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 = q3
    punpcklqdq xmm6, xmm1           ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 = q2

    ; free 1, 3 xmm7-7s xmm6-6s, xmm2-5s
    movq xmm1, QWORD PTR [rdx]              ; 93 83 92 82 91 81 90 80
    movq xmm3, QWORD PTR [rdx+16]           ; b3 a3 b2 a2 b1 a1 b0 a0

    punpcklwd xmm1, xmm3            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
    movdqa xmm3, xmm1               ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

    punpckhdq xmm3, xmm4            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
    punpckldq xmm1, xmm4            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

    movdqa xmm4, xmm5               ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    punpcklqdq xmm5, xmm3           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

    punpckhqdq xmm4, xmm3           ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    movdqa [rdx], xmm5              ; save 2s (p1 column)

    movdqa [rdx+16], xmm4           ; save 3s (p0 column)

    ; breakout conditions, as in the horizontal version
    movdqa xmm3, xmm6
    psubusb xmm3, xmm7              ; q3 - q2

    psubusb xmm7, xmm6              ; q2 - q3
    por xmm7, xmm3                  ; abs(q3-q2)

    movdqa xmm3, xmm2               ; q1
    psubusb xmm3, xmm6              ; q1 - q2

    psubusb xmm6, xmm2              ; q2 - q1
    por xmm6, xmm3                  ; abs(q2-q1)


    movdqa xmm3, xmm0               ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    punpcklqdq xmm0, xmm1           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

    punpckhqdq xmm3, xmm1           ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    movdqa xmm1, xmm3

    psubusb xmm3, xmm0              ; p2-p3
    psubusb xmm0, xmm1              ; p3-p2

    por xmm0, xmm3                  ; abs(p3-p2)
    movdqa xmm3, xmm5               ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

    psubusb xmm3, xmm1              ; p1-p2
    psubusb xmm1, xmm5              ; p2-p1

    por xmm1, xmm3                  ; abs(p1-p2)
    mov rdx, arg(3)                 ;limit

    movdqa xmm3, [rdx]              ; limit

    psubusb xmm7, xmm3
    psubusb xmm0, xmm3

    psubusb xmm1, xmm3
    psubusb xmm6, xmm3

    por xmm7, xmm6
    por xmm0, xmm1

    por xmm0, xmm7                  ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

    movdqa xmm1, xmm5               ; p1

    movdqa xmm7, xmm4               ; xmm4 xmm7 = p0

    psubusb xmm7, xmm5              ; p0 - p1
    psubusb xmm5, xmm4              ; p1 - p0

    por xmm5, xmm7                  ; abs(p1-p0)
    movdqa t0, xmm5                 ; save abs(p1-p0)

    lea rdx, srct
    psubusb xmm5, xmm3

    por xmm0, xmm5                  ; xmm0=mask
    movdqa xmm5, [rdx+32]           ; xmm5=q0

    movdqa xmm7, [rdx+48]           ; xmm7=q1
    movdqa xmm6, xmm5               ; mm6=q0

    movdqa xmm2, xmm7               ; q1

    psubusb xmm5, xmm7              ; q0-q1
    psubusb xmm7, xmm6              ; q1-q0

    por xmm7, xmm5                  ; abs(q1-q0)
    movdqa t1, xmm7                 ; save abs(q1-q0)

    psubusb xmm7, xmm3
    por xmm0, xmm7                  ; mask

    movdqa xmm5, xmm2               ; q1
    psubusb xmm5, xmm1              ; q1-=p1
    psubusb xmm1, xmm2              ; p1-=q1
    por xmm5, xmm1                  ; abs(p1-q1)
    pand xmm5, [tfe GLOBAL]         ; set lsb of each byte to zero
    psrlw xmm5, 1                   ; abs(p1-q1)/2

    mov rdx, arg(2)                 ;flimit
    movdqa xmm2, [rdx]              ;flimit xmm2

    movdqa xmm1, xmm4               ; xmm1=xmm4=p0

    movdqa xmm7, xmm6               ; xmm7=xmm6=q0
    psubusb xmm1, xmm7              ; p0-q0

    psubusb xmm7, xmm4              ; q0-p0
    por xmm1, xmm7                  ; abs(q0-p0)
    paddusb xmm1, xmm1              ; abs(q0-p0)*2
    paddusb xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2

    paddb xmm2, xmm2                ; flimit*2 (less than 255)
    paddb xmm3, xmm2                ; flimit * 2 + limit (less than 255)

    psubusb xmm1, xmm3              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit

    por xmm1, xmm0                  ; mask

    pxor xmm0, xmm0
    pcmpeqb xmm1, xmm0
    ; calculate high edge variance
    mov rdx, arg(4)                 ;thresh ; get thresh
    movdqa xmm7, [rdx]

    movdqa xmm4, t0                 ; get abs (q1 - q0)
    psubusb xmm4, xmm7

    movdqa xmm3, t1                 ; get abs (p1 - p0)
    psubusb xmm3, xmm7

    por xmm4, xmm3                  ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    pcmpeqb xmm4, xmm0

    pcmpeqb xmm0, xmm0
    pxor xmm4, xmm0                 ; xmm4 = hev mask

    ; start work on filters (operands reloaded from the srct scratch columns)
    lea rdx, srct

    movdqa xmm2, [rdx]              ; p1
    movdqa xmm7, [rdx+48]           ; q1

    movdqa xmm6, [rdx+16]           ; p0
    movdqa xmm0, [rdx+32]           ; q0

    pxor xmm2, [t80 GLOBAL]         ; p1 offset to convert to signed values
    pxor xmm7, [t80 GLOBAL]         ; q1 offset to convert to signed values

    psubsb xmm2, xmm7               ; p1 - q1
    pand xmm2, xmm4                 ; high var mask (hvm)(p1 - q1)

    pxor xmm6, [t80 GLOBAL]         ; offset to convert to signed values
    pxor xmm0, [t80 GLOBAL]         ; offset to convert to signed values

    movdqa xmm3, xmm0               ; q0
    psubsb xmm0, xmm6               ; q0 - p0

    paddsb xmm2, xmm0               ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb xmm2, xmm0               ; 2 * (q0 - p0) + hvm(p1 - q1)

    paddsb xmm2, xmm0               ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand xmm1, xmm2                 ; mask filter values we don't care about

    movdqa xmm2, xmm1
    paddsb xmm1, [t4 GLOBAL]        ; 3* (q0 - p0) + hvm(p1 - q1) + 4

    paddsb xmm2, [t3 GLOBAL]        ; 3* (q0 - p0) + hvm(p1 - q1) + 3
    pxor xmm0, xmm0

    pxor xmm5, xmm5
    punpcklbw xmm0, xmm2

    punpckhbw xmm5, xmm2
    psraw xmm0, 11                  ; sign extended shift right by 3

    psraw xmm5, 11
    packsswb xmm0, xmm5

    movdqa xmm2, xmm0               ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor xmm0, xmm0                 ; 0
    movdqa xmm5, xmm1               ; abcdefgh

    punpcklbw xmm0, xmm1            ; e0f0g0h0
    psraw xmm0, 11                  ; sign extended shift right by 3

    pxor xmm1, xmm1                 ; 0
    punpckhbw xmm1, xmm5            ; a0b0c0d0

    psraw xmm1, 11                  ; sign extended shift right by 3
    movdqa xmm5, xmm0               ; save results

    packsswb xmm0, xmm1             ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw xmm5, [ones GLOBAL]

    paddsw xmm1, [ones GLOBAL]
    psraw xmm5, 1                   ; partial shifted one more time for 2nd tap

    psraw xmm1, 1                   ; partial shifted one more time for 2nd tap
    packsswb xmm5, xmm1             ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

    pandn xmm4, xmm5                ; high edge variance additive

    paddsb xmm6, xmm2               ; p0+= p0 add
    pxor xmm6, [t80 GLOBAL]         ; unoffset

    ; mm6=p0
    movdqa xmm1, [rdx]              ; p1
    pxor xmm1, [t80 GLOBAL]         ; reoffset

    paddsb xmm1, xmm4               ; p1+= p1 add
    pxor xmm1, [t80 GLOBAL]         ; unoffset
    ; mm6 = p0 mm1 = p1

    psubsb xmm3, xmm0               ; q0-= q0 add
    pxor xmm3, [t80 GLOBAL]         ; unoffset

    ; mm3 = q0
    psubsb xmm7, xmm4               ; q1-= q1 add
    pxor xmm7, [t80 GLOBAL]         ; unoffset
    ; mm7 = q1

    ; transpose and write back
    ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
    movdqa xmm2, xmm1               ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    punpcklbw xmm2, xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

    movdqa xmm4, xmm3               ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    punpckhbw xmm1, xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    punpcklbw xmm4, xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
    punpckhbw xmm3, xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

    movdqa xmm6, xmm2               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpcklwd xmm2, xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

    punpckhwd xmm6, xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
    movdqa xmm5, xmm1               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    punpcklwd xmm1, xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
    punpckhwd xmm5, xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

    ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
    ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
    ; xmm5 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
    ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
    lea rsi, [rsi+rax*8]

    ; scatter 4 bytes (p1 p0 q0 q1) back into each of the 16 rows at column +2
    movd [rsi+rax*4+2], xmm2
    psrldq xmm2, 4

    movd [rdi+rax*4+2], xmm2
    psrldq xmm2, 4

    movd [rsi+rax*2+2], xmm2
    psrldq xmm2, 4

    movd [rdi+rax*2+2], xmm2
    movd [rsi+2], xmm6

    psrldq xmm6, 4
    movd [rdi+2], xmm6

    psrldq xmm6, 4
    neg rax

    movd [rdi+rax+2], xmm6
    psrldq xmm6, 4

    movd [rdi+rax*2+2], xmm6
    lea rsi, [rsi+rax*8]

    neg rax
    ;;;;;;;;;;;;;;;;;;;;/
    movd [rsi+rax*4+2], xmm1
    psrldq xmm1, 4

    movd [rcx+rax*4+2], xmm1
    psrldq xmm1, 4

    movd [rsi+rax*2+2], xmm1
    psrldq xmm1, 4

    movd [rcx+rax*2+2], xmm1
    psrldq xmm1, 4

    movd [rsi+2], xmm5
    psrldq xmm5, 4

    movd [rcx+2], xmm5
    psrldq xmm5, 4

    neg rax
    movd [rcx+rax+2], xmm5

    psrldq xmm5, 4
    movd [rcx+rax*2+2], xmm5

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
; Macroblock-edge ("mb") loop filter across a horizontal edge: stronger
; variant that, where hev is clear, spreads the correction over three
; pixels each side (taps ~27/7, 18/7, 9/7 via pmulhw with s27/s18/s9),
; updating p2..q2; where hev is set it behaves like the normal filter.
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push
rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 32                     ; reserve 32 bytes of aligned scratch
    %define t0 [rsp + 0]            ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]           ;__declspec(align(16)) char t1[8];

    mov rsi, arg(0)                 ;src_ptr
    movsxd rax, dword ptr arg(1)    ;src_pixel_step ; destination pitch?

    mov rdx, arg(3)                 ;limit
    movdqa xmm7, XMMWORD PTR [rdx]

    mov rdi, rsi                    ; rdi points to row +1 for indirect addressing
    add rdi, rax

    ; calculate breakout conditions (aligned movdqa loads here)
    movdqa xmm2, XMMWORD PTR [rdi+2*rax]    ; q3
    movdqa xmm1, XMMWORD PTR [rsi+2*rax]    ; q2

    movdqa xmm6, xmm1               ; q2
    psubusb xmm1, xmm2              ; q2-=q3


    psubusb xmm2, xmm6              ; q3-=q2
    por xmm1, xmm2                  ; abs(q3-q2)

    psubusb xmm1, xmm7

    ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
    movdqa xmm4, XMMWORD PTR [rsi+rax]      ; q1
    movdqa xmm3, xmm4               ; q1

    psubusb xmm4, xmm6              ; q1-=q2
    psubusb xmm6, xmm3              ; q2-=q1

    por xmm4, xmm6                  ; abs(q2-q1)
    psubusb xmm4, xmm7

    por xmm1, xmm4
    ; mm1 = mask, mm3=q1, mm7 = limit

    movdqa xmm4, XMMWORD PTR [rsi]          ; q0
    movdqa xmm0, xmm4               ; q0

    psubusb xmm4, xmm3              ; q0-=q1
    psubusb xmm3, xmm0              ; q1-=q0

    por xmm4, xmm3                  ; abs(q0-q1)
    movdqa t0, xmm4                 ; save to t0

    psubusb xmm4, xmm7
    por xmm1, xmm4

    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
    neg rax                         ; negate pitch to deal with above border

    movdqa xmm2, XMMWORD PTR [rsi+4*rax]    ; p3
    movdqa xmm4, XMMWORD PTR [rdi+4*rax]    ; p2

    movdqa xmm5, xmm4               ; p2
    psubusb xmm4, xmm2              ; p2-=p3

    psubusb xmm2, xmm5              ; p3-=p2
    por xmm4, xmm2                  ; abs(p3 - p2)

    psubusb xmm4, xmm7
    por xmm1, xmm4

    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
    movdqa xmm4, XMMWORD PTR [rsi+2*rax]    ; p1
    movdqa xmm3, xmm4               ; p1

    psubusb xmm4, xmm5              ; p1-=p2
    psubusb xmm5, xmm3              ; p2-=p1

    por xmm4, xmm5                  ; abs(p2 - p1)
    psubusb xmm4, xmm7

    por xmm1, xmm4

    movdqa xmm2, xmm3               ; p1

    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
    movdqa xmm4, XMMWORD PTR [rsi+rax]      ; p0
    movdqa xmm5, xmm4               ; p0

    psubusb xmm4, xmm3              ; p0-=p1
    psubusb xmm3, xmm5              ; p1-=p0

    por xmm4, xmm3                  ; abs(p1 - p0)
    movdqa t1, xmm4                 ; save to t1

    psubusb xmm4, xmm7
    por xmm1, xmm4

    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm5 = p0
    movdqa xmm3, XMMWORD PTR [rdi]          ; q1
    movdqa xmm4, xmm3               ; q1
    psubusb xmm3, xmm2              ; q1-=p1
    psubusb xmm2, xmm4              ; p1-=q1
    por xmm2, xmm3                  ; abs(p1-q1)
    pand xmm2, [tfe GLOBAL]         ; set lsb of each byte to zero
    psrlw xmm2, 1                   ; abs(p1-q1)/2

    movdqa xmm6, xmm5               ; p0
    movdqa xmm3, xmm0               ; q0

    psubusb xmm5, xmm3              ; p0-=q0
    psubusb xmm3, xmm6              ; q0-=p0

    por xmm5, xmm3                  ; abs(p0 - q0)
    paddusb xmm5, xmm5              ; abs(p0-q0)*2
    paddusb xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov rdx, arg(2)                 ;flimit ; get flimit
    movdqa xmm2, XMMWORD PTR [rdx]
    paddb xmm2, xmm2                ; flimit*2 (less than 255)
    paddb xmm7, xmm2                ; flimit * 2 + limit (less than 255)

    psubusb xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    por xmm1, xmm5
    pxor xmm5, xmm5
    pcmpeqb xmm1, xmm5              ; mask mm1
    ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm6 = p0,

    ; calculate high edge variance
    mov rdx, arg(4)                 ;thresh ; get thresh
    movdqa xmm7, XMMWORD PTR [rdx]

    movdqa xmm4, t0                 ; get abs (q1 - q0)
    psubusb xmm4, xmm7

    movdqa xmm3, t1                 ; get abs (p1 - p0)
    psubusb xmm3, xmm7

    paddb xmm4, xmm3                ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    pcmpeqb xmm4, xmm5

    pcmpeqb xmm5, xmm5
    pxor xmm4, xmm5
    ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm6 = p0, mm4=hev
    ; start work on filters
    movdqa xmm2, XMMWORD PTR [rsi+2*rax]    ; p1
    movdqa xmm7, XMMWORD PTR [rdi]          ; q1

    pxor xmm2, [t80 GLOBAL]         ; p1 offset to convert to signed values
    pxor xmm7, [t80 GLOBAL]         ; q1 offset to convert to signed values

    psubsb xmm2, xmm7               ; p1 - q1
    pxor xmm6, [t80 GLOBAL]         ; offset to convert to signed values

    pxor xmm0, [t80 GLOBAL]         ; offset to convert to signed values
    movdqa xmm3, xmm0               ; q0

    psubsb xmm0, xmm6               ; q0 - p0
    paddsb xmm2, xmm0               ; 1 * (q0 - p0) + (p1 - q1)

    paddsb xmm2, xmm0               ; 2 * (q0 - p0) + (p1 - q1)
    paddsb xmm2, xmm0               ; 3 * (q0 - p0) + (p1 - q1)

    pand xmm1, xmm2                 ; mask filter values we don't care about
    ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
    movdqa xmm2, xmm1               ; vp8_filter
    pand xmm2, xmm4                 ; Filter2 = vp8_filter & hev


    movdqa xmm5, xmm2
    paddsb xmm5, [t3 GLOBAL]

    pxor xmm0, xmm0                 ; 0
    pxor xmm7, xmm7                 ; 0

    punpcklbw xmm0, xmm5            ; e0f0g0h0
    psraw xmm0, 11                  ; sign extended shift right by 3

    punpckhbw xmm7, xmm5            ; a0b0c0d0
    psraw xmm7, 11                  ; sign extended shift right by 3

    packsswb xmm0, xmm7             ; Filter2 >>=3;
    movdqa xmm5, xmm0               ; Filter2

    paddsb xmm2, [t4 GLOBAL]        ; vp8_signed_char_clamp(Filter2 + 4)
    pxor xmm0, xmm0                 ; 0

    pxor xmm7, xmm7                 ; 0
    punpcklbw xmm0, xmm2            ; e0f0g0h0

    psraw xmm0, 11                  ; sign extended shift right by 3
    punpckhbw xmm7, xmm2            ; a0b0c0d0

    psraw xmm7, 11                  ; sign extended shift right by 3
    packsswb xmm0, xmm7             ; Filter2 >>=3;

    ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
    psubsb xmm3, xmm0               ; qs0 = qs0 - filter1
    paddsb xmm6, xmm5               ; ps0 = ps0 + Filter2

    ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
    ; vp8_filter &= ~hev;
    ; Filter2 = vp8_filter;
    pandn xmm4, xmm1                ; vp8_filter&=~hev


    ; mm3=qs0, mm4=filter2, mm6=ps0

    ; roughly 3/7th difference across boundary:
    ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
    ; s = vp8_signed_char_clamp(qs0 - u);
    ; *oq0 = s^0x80;
    ; s = vp8_signed_char_clamp(ps0 + u);
    ; *op0 = s^0x80;
    pxor xmm0, xmm0
    pxor xmm1, xmm1

    pxor xmm2, xmm2
    punpcklbw xmm1, xmm4

    punpckhbw xmm2, xmm4
    pmulhw xmm1, [s27 GLOBAL]

    pmulhw xmm2, [s27 GLOBAL]
    paddw xmm1, [s63 GLOBAL]

    paddw xmm2, [s63 GLOBAL]
    psraw xmm1, 7

    psraw xmm2, 7
    packsswb xmm1, xmm2

    psubsb xmm3, xmm1
    paddsb xmm6, xmm1

    pxor xmm3, [t80 GLOBAL]
    pxor xmm6, [t80 GLOBAL]

    movdqa XMMWORD PTR [rsi+rax], xmm6      ; write back p0
    movdqa XMMWORD PTR [rsi], xmm3          ; write back q0

    ; roughly 2/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
    ; s = vp8_signed_char_clamp(qs1 - u);
    ; *oq1 = s^0x80;
    ; s = vp8_signed_char_clamp(ps1 + u);
    ; *op1 = s^0x80;
    pxor xmm1, xmm1
    pxor xmm2, xmm2

    punpcklbw xmm1, xmm4
    punpckhbw xmm2, xmm4

    pmulhw xmm1, [s18 GLOBAL]
    pmulhw xmm2, [s18 GLOBAL]

    paddw xmm1, [s63 GLOBAL]
    paddw xmm2, [s63 GLOBAL]

    psraw xmm1, 7
    psraw xmm2, 7

    packsswb xmm1, xmm2

    movdqa xmm3, XMMWORD PTR [rdi]          ; q1
    movdqa xmm6, XMMWORD PTR [rsi+rax*2]    ; p1

    pxor xmm3, [t80 GLOBAL]
    pxor xmm6, [t80 GLOBAL]

    paddsb xmm6, xmm1
    psubsb xmm3, xmm1

    pxor xmm6, [t80 GLOBAL]
    pxor xmm3, [t80 GLOBAL]

    movdqa XMMWORD PTR [rdi], xmm3          ; write back q1
    movdqa XMMWORD PTR [rsi+rax*2], xmm6    ; write back p1

    ; roughly 1/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
    ; s = vp8_signed_char_clamp(qs2 - u);
    ; *oq2 = s^0x80;
    ; s = vp8_signed_char_clamp(ps2 + u);
    ; *op2 = s^0x80;
    pxor xmm1, xmm1
    pxor xmm2, xmm2

    punpcklbw xmm1, xmm4
    punpckhbw xmm2, xmm4

    pmulhw xmm1, [s9 GLOBAL]
    pmulhw xmm2, [s9 GLOBAL]

    paddw xmm1, [s63 GLOBAL]
    paddw xmm2, [s63 GLOBAL]

    psraw xmm1, 7
    psraw xmm2, 7

    packsswb xmm1, xmm2


    movdqa xmm6, XMMWORD PTR [rdi+rax*4]    ; p2 (rax still negated)
    neg rax

    movdqa xmm3, XMMWORD PTR [rdi+rax ]     ; q2

    pxor xmm6, [t80 GLOBAL]
    pxor xmm3, [t80 GLOBAL]

    paddsb xmm6, xmm1
    psubsb xmm3, xmm1

    pxor xmm6, [t80 GLOBAL]
    pxor xmm3, [t80 GLOBAL]

    movdqa XMMWORD PTR [rdi+rax ], xmm3     ; write back q2
    neg rax

    movdqa XMMWORD PTR [rdi+rax*4], xmm6    ; write back p2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
+sym(vp8_mbloop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 160 ; reserve 160 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; + + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + mov rcx, rax + neg rcx + + ; Transpose + movq xmm0, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + movq xmm7, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + + punpcklbw xmm7, xmm0 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + movq xmm0, QWORD PTR [rsi+rax] ; + + movq xmm5, QWORD PTR [rsi] ; + punpcklbw xmm5, xmm0 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + + movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 + movq xmm7, QWORD PTR [rsi + rcx] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + + movq xmm0, QWORD PTR [rsi + rcx*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 + + movq xmm4, QWORD PTR [rsi + rcx*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm7, QWORD PTR [rdi + rcx*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + + punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 
06 35 25 15 05 34 24 14 04 + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa t0, xmm2 ; save to free XMM2 + ;movdqa t1, xmm3 + + ; XMM3 XMM4 XMM7 in use + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + + movq xmm6, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + movq xmm5, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + movq xmm6, QWORD PTR [rsi+rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + movq xmm1, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movq xmm5, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + + movq xmm0, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm2, QWORD PTR [rsi+rcx*4] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm5, QWORD PTR [rdi+rcx*4] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + + punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + + 
punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + + lea rdx, srct + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 + + movdqa [rdx+112], xmm7 ; save 7 + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + movdqa [rdx+96], xmm6 ; save 6 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa [rdx+32], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + + movdqa [rdx+48], xmm3 ; save 3 + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rdx+64], xmm4 ; save 4 + movdqa [rdx+80], xmm5 ; save 5 + + movdqa xmm1, t0 ; get + movdqa xmm2, xmm1 ; + + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + movdqa [rdx+16], xmm1 + movdqa [rdx], xmm2 + + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + por xmm7, xmm0 ; abs (q3-q2) + + movdqa xmm1, xmm5 ; q1 + psubusb xmm1, xmm6 ; q1-q2 + + psubusb xmm6, xmm5 ; q2-q1 + por xmm6, xmm1 ; abs (q2-q1) + + ;/* + ;movdqa xmm0, xmm4 ; q0 + ;psubusb xmm0 xmm5 ; q0-q1 + ; + ;pusbusb xmm5, xmm4 ; q1-q0 + ;por xmm5, xmm0 ; abs (q1-q0) + ;*/ + + movdqa xmm1, [rdx+16] ; p2 + movdqa xmm0, xmm1 + + psubusb xmm0, xmm2 ; p2 - p3; + psubusb xmm2, xmm1 ; p3 - p2; + + por xmm0, xmm2 ; abs(p2-p3) + + movdqa xmm2, [rdx+32] ; p1 + movdqa xmm5, xmm2 ; p1 + 
+ psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + por xmm1, xmm5 ; abs(p2-p1) + mov rdx, arg(3) ;limit + + movdqa xmm4, [rdx] ; limit + psubusb xmm7, xmm4 ; + + + psubusb xmm0, xmm4 ; abs(p3-p2) > limit + psubusb xmm1, xmm4 ; abs(p2-p1) > limit + + psubusb xmm6, xmm4 ; abs(q2-q1) > limit + por xmm7, xmm6 ; or + + por xmm0, xmm1 ; + por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movdqa xmm1, xmm2 ; p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + psubusb xmm2, xmm3 ; p1-p0 + por xmm2, xmm7 ; abs(p1-p0) + + movdqa t0, xmm2 ; save abs(p1-p0) + lea rdx, srct + + psubusb xmm2, xmm4 ; abs(p1-p0)>limit + por xmm0, xmm2 ; mask + + movdqa xmm5, [rdx+64] ; q0 + movdqa xmm7, [rdx+80] ; q1 + + movdqa xmm6, xmm5 ; q0 + movdqa xmm2, xmm7 ; q1 + psubusb xmm5, xmm7 ; q0-q1 + + psubusb xmm7, xmm6 ; q1-q0 + por xmm7, xmm5 ; abs(q1-q0) + + movdqa t1, xmm7 ; save abs(q1-q0) + psubusb xmm7, xmm4 ; abs(q1-q0)> limit + + por xmm0, xmm7 ; mask + + movdqa xmm5, xmm2 ; q1 + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm2 ; p1-=q1 + por xmm5, xmm1 ; abs(p1-q1) + pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw xmm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit ; + movdqa xmm2, [rdx] ; flimit + + movdqa xmm1, xmm3 ; p0 + movdqa xmm7, xmm6 ; q0 + psubusb xmm1, xmm7 ; p0-q0 + psubusb xmm7, xmm3 ; q0-p0 + por xmm1, xmm7 ; abs(q0-p0) + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + paddb xmm2, xmm2 ; flimit*2 (less than 255) + paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + por xmm1, xmm0; ; mask + pxor xmm0, xmm0 + pcmpeqb xmm1, xmm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movdqa xmm7, [rdx] + + movdqa xmm4, t0 ; get abs (q1 - q0) + psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh + + movdqa xmm3, t1 ; get abs (p1 - p0) + psubusb xmm3, 
xmm7 ; abs(p1 - p0)> thresh + + por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb xmm4, xmm0 + + pcmpeqb xmm0, xmm0 + pxor xmm4, xmm0 + + + ; start work on filters + lea rdx, srct + + ; start work on filters + movdqa xmm2, [rdx+32] ; p1 + movdqa xmm7, [rdx+80] ; q1 + + pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + movdqa xmm6, [rdx+48] ; p0 + + movdqa xmm0, [rdx+64] ; q0 + pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values + + pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values + movdqa xmm3, xmm0 ; q0 + + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) + + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1) + + pand xmm1, xmm2 ; mask filter values we don't care about + + ; xmm1 = vp8_filter, xmm4=hev, xmm6=ps0, xmm3=qs0 + movdqa xmm2, xmm1 ; vp8_filter + pand xmm2, xmm4; ; Filter2 = vp8_filter & hev + + movdqa xmm5, xmm2 + paddsb xmm5, [t3 GLOBAL] + + pxor xmm0, xmm0 ; 0 + pxor xmm7, xmm7 ; 0 + + punpcklbw xmm0, xmm5 ; e0f0g0h0 + psraw xmm0, 11 ; sign extended shift right by 3 + + punpckhbw xmm7, xmm5 ; a0b0c0d0 + psraw xmm7, 11 ; sign extended shift right by 3 + + packsswb xmm0, xmm7 ; Filter2 >>=3; + movdqa xmm5, xmm0 ; Filter2 + + paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) + pxor xmm0, xmm0 ; 0 + + pxor xmm7, xmm7 ; 0 + punpcklbw xmm0, xmm2 ; e0f0g0h0 + + psraw xmm0, 11 ; sign extended shift right by 3 + punpckhbw xmm7, xmm2 ; a0b0c0d0 + + psraw xmm7, 11 ; sign extended shift right by 3 + packsswb xmm0, xmm7 ; Filter2 >>=3; + + ; xmm0= filter2 xmm1 = vp8_filter, xmm3 =qs0 xmm5=s xmm4 =hev xmm6=ps0 + psubsb xmm3, xmm0 ; qs0 =qs0 - filter1 + paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 + + + ; xmm1=vp8_filter, xmm3=qs0, xmm4 =hev xmm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn xmm4, xmm1 ; vp8_filter&=~hev + + ; 
xmm3=qs0, xmm4=filter2, xmm6=ps0 + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + pxor xmm2, xmm2 + punpcklbw xmm1, xmm4 + + punpckhbw xmm2, xmm4 + pmulhw xmm1, [s27 GLOBAL] + + pmulhw xmm2, [s27 GLOBAL] + paddw xmm1, [s63 GLOBAL] + + paddw xmm2, [s63 GLOBAL] + psraw xmm1, 7 + + psraw xmm2, 7 + packsswb xmm1, xmm2 + + psubsb xmm3, xmm1 + paddsb xmm6, xmm1 + + pxor xmm3, [t80 GLOBAL] + pxor xmm6, [t80 GLOBAL] + + movdqa [rdx+48], xmm6 + movdqa [rdx+64], xmm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor xmm1, xmm1 + pxor xmm2, xmm2 + + punpcklbw xmm1, xmm4 + punpckhbw xmm2, xmm4 + + pmulhw xmm1, [s18 GLOBAL] + pmulhw xmm2, [s18 GLOBAL] + + paddw xmm1, [s63 GLOBAL] + paddw xmm2, [s63 GLOBAL] + + psraw xmm1, 7 + psraw xmm2, 7 + + packsswb xmm1, xmm2 + + movdqa xmm3, [rdx + 80] ;/q1 + movdqa xmm6, [rdx + 32] ; p1 + + pxor xmm3, [t80 GLOBAL] + pxor xmm6, [t80 GLOBAL] + + paddsb xmm6, xmm1 + psubsb xmm3, xmm1 + + pxor xmm6, [t80 GLOBAL] + pxor xmm3, [t80 GLOBAL] + + movdqa [rdx + 80], xmm3 + movdqa [rdx + 32], xmm6 + + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor xmm1, xmm1 + pxor xmm2, xmm2 + + punpcklbw xmm1, xmm4 + punpckhbw xmm2, xmm4 + + pmulhw xmm1, [s9 GLOBAL] + pmulhw xmm2, [s9 GLOBAL] + + paddw xmm1, [s63 GLOBAL] + paddw xmm2, [s63 GLOBAL] + + psraw xmm1, 7 + psraw xmm2, 7 + + packsswb xmm1, xmm2 + + movdqa xmm6, [rdx+16] + movdqa xmm3, [rdx+96] + + pxor xmm6, [t80 GLOBAL] + pxor xmm3, [t80 GLOBAL] + + paddsb xmm6, xmm1 + psubsb xmm3, xmm1 + + pxor 
xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 15 06 + + + ; transpose and write back + movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 + + movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 + + punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 + movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 + punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 + + lea rsi, [rsi+rcx*8] + lea rdi, 
[rdi+rcx*8] + + movq QWORD PTR [rsi+rcx*4], xmm0 + psrldq xmm0, 8 + + movq QWORD PTR [rsi+rcx*2], xmm6 + psrldq xmm6, 8 + + movq QWORD PTR [rdi+rcx*4], xmm0 + movq QWORD PTR [rsi+rcx], xmm6 + + movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + + punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + + movq QWORD PTR [rsi], xmm0 + psrldq xmm0, 8 + + movq QWORD PTR [rsi+rax*2], xmm5 + psrldq xmm5, 8 + + movq QWORD PTR [rsi+rax], xmm0 + movq QWORD PTR [rdi+rax*2], xmm5 + + movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + movdqa xmm0, xmm2 + + punpcklwd xmm0, xmm3 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 + punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + + movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 + + punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + + movq QWORD PTR [rsi+rcx*4], xmm1 + psrldq xmm1, 8 + + movq QWORD PTR [rsi+rcx*2], xmm3 + psrldq xmm3, 8 + + movq QWORD PTR [rdi+rcx*4], xmm1 + movq QWORD PTR [rsi+rcx], xmm3 + + movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 + + punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 + movq QWORD PTR [rsi], xmm1 + + psrldq xmm1, 8 + + movq QWORD PTR [rsi+rax*2], xmm4 + psrldq xmm4, 8 + + movq QWORD PTR [rsi+rax], xmm1 + movq QWORD PTR [rdi+rax*2], xmm4 + + add rsp, 160 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_sse2 +;( +; 
unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_simple_horizontal_edge_sse2) +sym(vp8_loop_filter_simple_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + mov rdx, arg(2) ;flimit ; get flimit + movdqa xmm3, XMMWORD PTR [rdx] + mov rdx, arg(3) ;limit + movdqa xmm7, XMMWORD PTR [rdx] + + paddb xmm3, xmm3 ; flimit*2 (less than 255) + paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255) + + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + neg rax + + ; calculate mask + movdqu xmm1, [rsi+2*rax] ; p1 + movdqu xmm0, [rdi] ; q1 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + psubusb xmm0, xmm1 ; q1-=p1 + psubusb xmm1, xmm4 ; p1-=q1 + por xmm1, xmm0 ; abs(p1-q1) + pand xmm1, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw xmm1, 1 ; abs(p1-q1)/2 + + movdqu xmm5, [rsi+rax] ; p0 + movdqu xmm4, [rsi] ; q0 + movdqa xmm0, xmm4 ; q0 + movdqa xmm6, xmm5 ; p0 + psubusb xmm5, xmm4 ; p0-=q0 + psubusb xmm4, xmm6 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + pxor xmm3, xmm3 + pcmpeqb xmm5, xmm3 + + ; start work on filters + pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values + psubsb xmm2, xmm7 ; p1 - q1 + + pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values + pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) + 
pand xmm5, xmm2 ; mask filter values we don't care about + + ; do + 4 side + paddsb xmm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movdqa xmm0, xmm5 ; get a copy of filters + psllw xmm0, 8 ; shift left 8 + psraw xmm0, 3 ; arithmetic shift right 11 + psrlw xmm0, 8 + movdqa xmm1, xmm5 ; get a copy of filters + psraw xmm1, 11 ; arithmetic shift right 11 + psllw xmm1, 8 ; shift left 8 to put it back + + por xmm0, xmm1 ; put the two together to get result + + psubsb xmm3, xmm0 ; q0-= q0 add + pxor xmm3, [t80 GLOBAL] ; unoffset + movdqu [rsi], xmm3 ; write back + + ; now do +3 side + psubsb xmm5, [t1s GLOBAL] ; +3 instead of +4 + + movdqa xmm0, xmm5 ; get a copy of filters + psllw xmm0, 8 ; shift left 8 + psraw xmm0, 3 ; arithmetic shift right 11 + psrlw xmm0, 8 + psraw xmm5, 11 ; arithmetic shift right 11 + psllw xmm5, 8 ; shift left 8 to put it back + por xmm0, xmm5 ; put the two together to get result + + + paddsb xmm6, xmm0 ; p0+= p0 add + pxor xmm6, [t80 GLOBAL] ; unoffset + movdqu [rsi+rax], xmm6 ; write back + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *flimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_simple_vertical_edge_sse2) +sym(vp8_loop_filter_simple_vertical_edge_sse2): + push rbp ; save old base pointer value. + mov rbp, rsp ; set new base pointer value. + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx ; save callee-saved reg + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
+ + lea rsi, [rsi - 2 ] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movdqu xmm2, [rdi] ; 13 12 11 10 + movdqu xmm3, [rcx] ; 53 52 51 50 + punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 + punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 + + movdqu xmm4, [rsi + rax*2] ; 23 22 21 20 + movdqu xmm5, [rdx + rax*2] ; 63 62 61 60 + movdqu xmm6, [rdi + rax*2] ; 33 32 31 30 + movdqu xmm7, [rcx + rax*2] ; 73 72 71 70 + punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 + punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 + + punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + movdqa t0, xmm0 ; save to t0 + movdqa t1, xmm2 ; save to t1 + + lea rsi, [rsi + rax*8] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movdqu xmm4, [rsi] ; 83 82 81 80 + movdqu xmm1, [rdx] ; c3 c2 c1 c0 + movdqu xmm6, [rdi] ; 93 92 91 90 + movdqu xmm3, [rcx] ; d3 d2 d1 d0 + punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 + punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 + + movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0 + movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0 + movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + + punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 + punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + + movdqa xmm1, 
xmm4 + punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + + movdqa xmm6, xmm4 + punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + + movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + + punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + ; calculate mask + movdqa xmm6, xmm0 ; p1 + movdqa xmm7, xmm3 ; q1 + psubusb xmm7, xmm0 ; q1-=p1 + psubusb xmm6, xmm3 ; p1-=q1 + por xmm6, xmm7 ; abs(p1-q1) + pand xmm6, [tfe GLOBAL] ; set lsb of each byte to zero + psrlw xmm6, 1 ; abs(p1-q1)/2 + + movdqa xmm5, xmm1 ; p0 + movdqa xmm4, xmm2 ; q0 + psubusb xmm5, xmm2 ; p0-=q0 + psubusb xmm4, xmm1 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;flimit + movdqa xmm7, XMMWORD PTR [rdx] + mov rdx, arg(3) ; get limit + movdqa xmm6, XMMWORD PTR [rdx] + paddb xmm7, xmm7 ; flimit*2 (less than 255) + paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255) + + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; mm5 = mask + + ; start work on filters + movdqa t0, xmm0 + movdqa t1, xmm3 + + pxor xmm0, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor xmm3, [t80 GLOBAL] ; q1 offset to convert to signed values + + psubsb xmm0, xmm3 ; p1 - q1 + movdqa xmm6, xmm1 ; p0 + + movdqa xmm7, xmm2 ; q0 + pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values + + pxor 
xmm7, [t80 GLOBAL] ; offset to convert to signed values + movdqa xmm3, xmm7 ; offseted ; q0 + + psubsb xmm7, xmm6 ; q0 - p0 + paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) + + paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) + + pand xmm5, xmm0 ; mask filter values we don't care about + + + paddsb xmm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movdqa xmm0, xmm5 ; get a copy of filters + psllw xmm0, 8 ; shift left 8 + + psraw xmm0, 3 ; arithmetic shift right 11 + psrlw xmm0, 8 + + movdqa xmm7, xmm5 ; get a copy of filters + psraw xmm7, 11 ; arithmetic shift right 11 + + psllw xmm7, 8 ; shift left 8 to put it back + por xmm0, xmm7 ; put the two together to get result + + psubsb xmm3, xmm0 ; q0-= q0sz add + pxor xmm3, [t80 GLOBAL] ; unoffset q0 + + ; now do +3 side + psubsb xmm5, [t1s GLOBAL] ; +3 instead of +4 + movdqa xmm0, xmm5 ; get a copy of filters + + psllw xmm0, 8 ; shift left 8 + psraw xmm0, 3 ; arithmetic shift right 11 + + psrlw xmm0, 8 + psraw xmm5, 11 ; arithmetic shift right 11 + + psllw xmm5, 8 ; shift left 8 to put it back + por xmm0, xmm5 ; put the two together to get result + + paddsb xmm6, xmm0 ; p0+= p0 add + pxor xmm6, [t80 GLOBAL] ; unoffset p0 + + movdqa xmm0, t0 ; p1 + movdqa xmm4, t1 ; q1 + + ; transpose back to write out + ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm5, xmm3 + punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ; 73 72 71 70 63 
62 61 60 53 52 51 50 43 42 41 40 + + movdqa xmm3, xmm1 + punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + ; write out order: xmm0 xmm2 xmm1 xmm3 + lea rdx, [rsi + rax*4] + + movd [rsi], xmm1 ; write the second 8-line result + psrldq xmm1, 4 + movd [rdi], xmm1 + psrldq xmm1, 4 + movd [rsi + rax*2], xmm1 + psrldq xmm1, 4 + movd [rdi + rax*2], xmm1 + + movd [rdx], xmm3 + psrldq xmm3, 4 + movd [rcx], xmm3 + psrldq xmm3, 4 + movd [rdx + rax*2], xmm3 + psrldq xmm3, 4 + movd [rcx + rax*2], xmm3 + + neg rax + lea rsi, [rsi + rax*8] + neg rax + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd [rsi], xmm0 ; write the first 8-line result + psrldq xmm0, 4 + movd [rdi], xmm0 + psrldq xmm0, 4 + movd [rsi + rax*2], xmm0 + psrldq xmm0, 4 + movd [rdi + rax*2], xmm0 + + movd [rdx], xmm2 + psrldq xmm2, 4 + movd [rcx], xmm2 + psrldq xmm2, 4 + movd [rdx + rax*2], xmm2 + psrldq xmm2, 4 + movd [rcx + rax*2], xmm2 + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +tfe: + times 16 db 0xfe +align 16 +t80: + times 16 db 0x80 +align 16 +t1s: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 +align 16 +ones: + times 8 dw 0x0001 +align 16 +s27: + times 8 dw 0x1b00 +align 16 +s18: + times 8 dw 0x1200 +align 16 +s9: + times 8 dw 0x0900 +align 16 +s63: + times 8 dw 0x003f diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c new file mode 100644 index 000000000..143ee7469 --- /dev/null +++ b/vp8/common/x86/loopfilter_x86.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS
 * file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "loopfilter.h"

/*
 * Edge-kernel declarations for the C, MMX and SSE2 loop-filter
 * implementations dispatched to by the wrappers below.  The
 * prototype_loopfilter() macro is presumably defined in loopfilter.h --
 * confirm there for the exact kernel signature.
 */
prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);

prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);

prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);

#if HAVE_MMX
// Horizontal MB filtering
/*
 * Filter the top (macroblock-boundary) horizontal edge of one macroblock
 * using the MMX edge kernel.
 *
 * u_ptr / v_ptr may be NULL, in which case the corresponding chroma plane
 * is skipped.  Luma uses the mb* thresholds from lfi with a trailing count
 * of 2, chroma the uv* thresholds with a count of 1 -- the count is
 * presumably the number of 8-pixel groups along the edge; confirm against
 * the assembly kernel.
 */
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
    (void) simpler_lpf;   /* unused in this variant; cast silences the warning */
    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);

    if (u_ptr)
        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);

    if (v_ptr)
        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
}


/* Simple-filter variant of the above: operates on the luma plane only, so
   the chroma arguments are deliberately unused. */
void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + + +// Vertical MB Filtering +void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + + +void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + + +// Horizontal B Filtering +void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + + +void 
vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + + +// Vertical B Filtering +void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + + +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} +#endif + + +// Horizontal MB filtering +#if HAVE_SSE2 +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char 
*v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + + +void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + + +// Vertical MB Filtering +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + + +void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + + +// Horizontal B Filtering +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, 
lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + + +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + + +// Vertical B Filtering +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + + +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int 
simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +#endif + +#if 0 +void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr, + int y_stride, + loop_filter_info *lfi) +{ + + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} +#endif diff --git a/vp8/common/x86/loopfilter_x86.h b/vp8/common/x86/loopfilter_x86.h new file mode 100644 index 000000000..c87f38a31 --- /dev/null +++ b/vp8/common/x86/loopfilter_x86.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef LOOPFILTER_X86_H +#define LOOPFILTER_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx); +extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_mmx + +#undef vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_mmx + +#undef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_mmx + +#undef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx + +#undef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx + +#undef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx + +#undef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx + +#undef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2); +extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_sse2 + +#undef 
vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_sse2 + +#undef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_sse2 + +#undef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2 + +#undef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2 + +#undef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2 + +#undef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2 + +#undef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2 +#endif +#endif + + +#endif diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm new file mode 100644 index 000000000..721c8d612 --- /dev/null +++ b/vp8/common/x86/postproc_mmx.asm @@ -0,0 +1,533 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + +;void vp8_post_proc_down_and_across_mmx +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp8_post_proc_down_and_across_mmx) +sym(vp8_post_proc_down_and_across_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movq mm0, [rd GLOBAL] + sub rsp, 8 + movq [rsp], mm0 +%define RD [rsp] +%else +%define RD [rd GLOBAL] +%endif + + push rbx + lea rbx, [Blur GLOBAL] + movd mm2, dword ptr arg(6) ;flimit + punpcklwd mm2, mm2 + punpckldq mm2, mm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
+ pxor mm0, mm0 ; mm0 = 00000000 + +nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +nextcol: + + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps + movq mm3, [rsi] ; mm4 = r0 p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] ; mm6 = kernel 3 taps + movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 + pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = r0 p0..p3 + psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers + movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + neg rax + movq mm6, [rbx ] ; kernel 0 taps + movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] ; kernel 1 taps + movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
+ paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + + movd [rdi], mm1 ; + neg rax ; pitch is positive + + + add rsi, 4 + add rdi, 4 + add rdx, 4 + + cmp edx, dword ptr arg(5) ;cols + jl nextcol + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + + push rax + xor rdx, rdx + mov rax, [rdi-4]; + +acrossnextcol: + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; + movq mm4, [rdi+rdx] ; mm4 = p0..p7 + movq mm3, mm4 ; mm3 = p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] + psrlq mm4, 8 ; mm4 = p1..p7 + movq mm5, mm4 ; mm5 = p1..p7 + punpcklbw mm5, mm0 ; mm5 = p1..p4 + pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = p0..p3 + psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] + psrlq mm4, 8 ; mm4 = p2..p7 + movq mm5, mm4 ; mm5 = p2..p7 + punpcklbw mm5, mm0 ; mm5 = p2..p5 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + movq mm6, [rbx ] + 
movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 + movq mm5, mm4 ; mm5 = p-2..p5 + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] + psrlq mm4, 8 ; mm4 = p-1..p5 + punpcklbw mm4, mm0 ; mm4 = p-1..p2 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 + psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes + movd eax, mm1 + + add rdx, 4 + cmp edx, dword ptr arg(5) ;cols + jl acrossnextcol; + + mov DWORD PTR [rdi+rdx-4], eax + pop rax + + ; done with this rwo + add rsi,rax ; next line + movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
+ + dec rcx ; decrement count + jnz nextrow ; next row + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef RD + + +;void vp8_mbpost_proc_down_mmx(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_mmx) +sym(vp8_mbpost_proc_down_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 136 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax +%define flimit2 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [sym(vp8_rv) GLOBAL] +%endif + + ;rows +=8; + add dword ptr arg(2), 8 + + ;for(c=0; c<cols; c+=4) +loop_col: + mov rsi, arg(0) ;s + pxor mm0, mm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + neg rax ; rax = -pitch + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + + pxor mm5, mm5 + pxor mm6, mm6 ; + + pxor mm7, mm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +loop_initvar: + movd mm1, DWORD PTR [rdi]; + punpcklbw mm1, mm0 ; + + paddw mm5, mm1 ; + pmullw mm1, mm1 ; + + movq mm2, mm1 ; + punpcklwd mm1, mm0 ; + + punpckhwd mm2, mm0 ; + paddd mm6, mm1 ; + + paddd mm7, mm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne loop_initvar + ;save the var and sum + xor rdx, rdx +loop_row: + movd mm1, DWORD PTR [rsi] ; [s-pitch*8] + movd mm2, DWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw mm1, mm0 + punpcklbw mm2, mm0 + + paddw mm5, mm2 + psubw mm5, mm1 + + pmullw mm2, mm2 + movq mm4, mm2 + + punpcklwd mm2, mm0 + punpckhwd mm4, mm0 + + paddd mm6, mm2 + paddd mm7, mm4 + + pmullw mm1, mm1 + movq mm2, mm1 + + punpcklwd mm1, mm0 + psubd mm6, mm1 + + punpckhwd mm2, mm0 + psubd mm7, mm2 + + + movq mm3, mm6 + pslld mm3, 4 + + psubd mm3, mm6 + movq mm1, mm5 + + movq mm4, mm5 + pmullw mm1, mm1 + + pmulhw mm4, mm4 + movq mm2, mm1 + + punpcklwd mm1, mm4 + punpckhwd mm2, mm4 + + movq mm4, mm7 + pslld 
mm4, 4 + + psubd mm4, mm7 + + psubd mm3, mm1 + psubd mm4, mm2 + + psubd mm3, flimit2 + psubd mm4, flimit2 + + psrad mm3, 31 + psrad mm4, 31 + + packssdw mm3, mm4 + packsswb mm3, mm0 + + movd mm1, DWORD PTR [rsi+rax*8] + + movq mm2, mm1 + punpcklbw mm1, mm0 + + paddw mm1, mm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [sym(vp8_rv) GLOBAL] + movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movq mm4, [sym(vp8_rv) + rcx*2] +%endif + paddw mm1, mm4 + ;paddw xmm1, eight8s + psraw mm1, 4 + + packuswb mm1, mm0 + pand mm1, mm3 + + pandn mm3, mm2 + por mm1, mm3 + + and rcx, 15 + movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] + + mov rcx, rdx + sub rcx, 8 + + and rcx, 15 + movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] + + movd [rsi], mm1 + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl loop_row + + + add dword arg(0), 4 ; s += 4 + sub dword arg(3), 4 ; cols -= 4 + cmp dword arg(3), 0 + jg loop_col + + add rsp, 136 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef flimit2 + + +;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +extern sym(rand) +global sym(vp8_plane_add_noise_mmx) +sym(vp8_plane_add_noise_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +addnoise_loop: + call sym(rand) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. 
Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +addnoise_nextset: + movq mm1,[rsi+rax] ; get the source + + psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb mm1, [rdx+32] ;bothclamp + psubusb mm1, [rdx+16] ;whiteclamp + + movq mm2,[rdi+rax] ; get the noise for this line + paddb mm1,mm2 ; add it in + movq [rsi+rax],mm1 ; store the result + + add rax,8 ; move to the next line + + cmp rax, rcx + jl addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +Blur: + times 16 dw 16 + times 8 dw 64 + times 16 dw 16 + times 8 dw 0 + +rd: + times 4 dw 0x40 diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c new file mode 100644 index 000000000..095797b1e --- /dev/null +++ b/vp8/common/x86/postproc_mmx.c @@ -0,0 +1,1507 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <math.h> +#include <stdlib.h> +#include "vpx_scale/yv12config.h" +#include "pragmas.h" + +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + + + +/* static constants */ +__declspec(align(16)) +const static short Blur[48] = +{ + + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 64, 64, 64, 64, 64, 64, 64, 64, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 0, 0, 0, 0, 0, 0, 0, 0, + +}; +#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040; +#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004}; + +#ifndef RELOCATEABLE +const static RD; +const static R4D2; +#endif + + +/* external references */ +extern double vp8_gaussian(double sigma, double mu, double x); +extern short vp8_rv[]; +extern int vp8_q2mbl(int x) ; + + + +void vp8_post_proc_down_and_across_mmx +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +) +{ +#ifdef RELOCATEABLE + RD + R4D2 +#endif + + __asm + { + push ebx + lea ebx, Blur + movd mm2, flimit + punpcklwd mm2, mm2 + punpckldq mm2, mm2 + + mov esi, src_ptr + mov edi, dst_ptr + + mov ecx, DWORD PTR rows + mov eax, src_pixels_per_line ; + destination pitch? 
+ pxor mm0, mm0 ; + mm0 = 00000000 + + nextrow: + + xor edx, edx ; + + clear out edx for use as loop counter + nextcol: + + pxor mm7, mm7 ; + + mm7 = 00000000 + movq mm6, [ebx + 32 ] ; + mm6 = kernel 2 taps + movq mm3, [esi] ; + mm4 = r0 p0..p7 + punpcklbw mm3, mm0 ; + mm3 = p0..p3 + movq mm1, mm3 ; + mm1 = p0..p3 + pmullw mm3, mm6 ; + mm3 *= kernel 2 modifiers + + movq mm6, [ebx + 48] ; + mm6 = kernel 3 taps + movq mm5, [esi + eax] ; + mm4 = r1 p0..p7 + punpcklbw mm5, mm0 ; + mm5 = r1 p0..p3 + pmullw mm6, mm5 ; + mm6 *= p0..p3 * kernel 3 modifiers + paddusw mm3, mm6 ; + mm3 += mm6 + + ; + thresholding + movq mm7, mm1 ; + mm7 = r0 p0..p3 + psubusw mm7, mm5 ; + mm7 = r0 p0..p3 - r1 p0..p3 + psubusw mm5, mm1 ; + mm5 = r1 p0..p3 - r0 p0..p3 + paddusw mm7, mm5 ; + mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw mm7, mm2 + + movq mm6, [ebx + 64 ] ; + mm6 = kernel 4 modifiers + movq mm5, [esi + 2*eax] ; + mm4 = r2 p0..p7 + punpcklbw mm5, mm0 ; + mm5 = r2 p0..p3 + pmullw mm6, mm5 ; + mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; + mm3 += mm5 + + ; + thresholding + movq mm6, mm1 ; + mm6 = r0 p0..p3 + psubusw mm6, mm5 ; + mm6 = r0 p0..p3 - r2 p0..p3 + psubusw mm5, mm1 ; + mm5 = r2 p0..p3 - r2 p0..p3 + paddusw mm6, mm5 ; + mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; + accumulate thresholds + + + neg eax + movq mm6, [ebx ] ; + kernel 0 taps + movq mm5, [esi+2*eax] ; + mm4 = r-2 p0..p7 + punpcklbw mm5, mm0 ; + mm5 = r-2 p0..p3 + pmullw mm6, mm5 ; + mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; + mm3 += mm5 + + ; + thresholding + movq mm6, mm1 ; + mm6 = r0 p0..p3 + psubusw mm6, mm5 ; + mm6 = p0..p3 - r-2 p0..p3 + psubusw mm5, mm1 ; + mm5 = r-2 p0..p3 - p0..p3 + paddusw mm6, mm5 ; + mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; + accumulate thresholds + + movq mm6, [ebx + 16] ; + kernel 1 taps + movq mm4, [esi+eax] ; + mm4 = r-1 p0..p7 + punpcklbw mm4, mm0 ; + mm4 = r-1 p0..p3 + pmullw mm6, mm4 ; + mm4 *= kernel 1 modifiers. 
+ paddusw mm3, mm6 ; + mm3 += mm5 + + ; + thresholding + movq mm6, mm1 ; + mm6 = r0 p0..p3 + psubusw mm6, mm4 ; + mm6 = p0..p3 - r-2 p0..p3 + psubusw mm4, mm1 ; + mm5 = r-1 p0..p3 - p0..p3 + paddusw mm6, mm4 ; + mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; + accumulate thresholds + + + paddusw mm3, rd ; + mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; + mm3 /= 128 + + pand mm1, mm7 ; + mm1 select vals > thresh from source + pandn mm7, mm3 ; + mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; + combination + + packuswb mm1, mm0 ; + pack to bytes + + movd [edi], mm1 ; + neg eax ; + pitch is positive + + + add esi, 4 + add edi, 4 + add edx, 4 + + cmp edx, cols + jl nextcol + // done with the all cols, start the across filtering in place + sub esi, edx + sub edi, edx + + + push eax + xor edx, edx + mov eax, [edi-4]; + + acrossnextcol: + pxor mm7, mm7 ; + mm7 = 00000000 + movq mm6, [ebx + 32 ] ; + movq mm4, [edi+edx] ; + mm4 = p0..p7 + movq mm3, mm4 ; + mm3 = p0..p7 + punpcklbw mm3, mm0 ; + mm3 = p0..p3 + movq mm1, mm3 ; + mm1 = p0..p3 + pmullw mm3, mm6 ; + mm3 *= kernel 2 modifiers + + movq mm6, [ebx + 48] + psrlq mm4, 8 ; + mm4 = p1..p7 + movq mm5, mm4 ; + mm5 = p1..p7 + punpcklbw mm5, mm0 ; + mm5 = p1..p4 + pmullw mm6, mm5 ; + mm6 *= p1..p4 * kernel 3 modifiers + paddusw mm3, mm6 ; + mm3 += mm6 + + ; + thresholding + movq mm7, mm1 ; + mm7 = p0..p3 + psubusw mm7, mm5 ; + mm7 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; + mm5 = p1..p4 - p0..p3 + paddusw mm7, mm5 ; + mm7 = abs(p0..p3 - p1..p4) + pcmpgtw mm7, mm2 + + movq mm6, [ebx + 64 ] + psrlq mm4, 8 ; + mm4 = p2..p7 + movq mm5, mm4 ; + mm5 = p2..p7 + punpcklbw mm5, mm0 ; + mm5 = p2..p5 + pmullw mm6, mm5 ; + mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; + mm3 += mm5 + + ; + thresholding + movq mm6, mm1 ; + mm6 = p0..p3 + psubusw mm6, mm5 ; + mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; + mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; + mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + 
por mm7, mm6 ; + accumulate thresholds + + + movq mm6, [ebx ] + movq mm4, [edi+edx-2] ; + mm4 = p-2..p5 + movq mm5, mm4 ; + mm5 = p-2..p5 + punpcklbw mm5, mm0 ; + mm5 = p-2..p1 + pmullw mm6, mm5 ; + mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; + mm3 += mm5 + + ; + thresholding + movq mm6, mm1 ; + mm6 = p0..p3 + psubusw mm6, mm5 ; + mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; + mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; + mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; + accumulate thresholds + + movq mm6, [ebx + 16] + psrlq mm4, 8 ; + mm4 = p-1..p5 + punpcklbw mm4, mm0 ; + mm4 = p-1..p2 + pmullw mm6, mm4 ; + mm4 *= kernel 1 modifiers. + paddusw mm3, mm6 ; + mm3 += mm5 + + ; + thresholding + movq mm6, mm1 ; + mm6 = p0..p3 + psubusw mm6, mm4 ; + mm6 = p0..p3 - p1..p4 + psubusw mm4, mm1 ; + mm5 = p1..p4 - p0..p3 + paddusw mm6, mm4 ; + mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; + accumulate thresholds + + paddusw mm3, rd ; + mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; + mm3 /= 128 + + pand mm1, mm7 ; + mm1 select vals > thresh from source + pandn mm7, mm3 ; + mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; + combination + + packuswb mm1, mm0 ; + pack to bytes + mov DWORD PTR [edi+edx-4], eax ; + store previous four bytes + movd eax, mm1 + + add edx, 4 + cmp edx, cols + jl acrossnextcol; + + mov DWORD PTR [edi+edx-4], eax + pop eax + + // done with this rwo + add esi, eax ; + next line + mov eax, dst_pixels_per_line ; + destination pitch? + add edi, eax ; + next destination + mov eax, src_pixels_per_line ; + destination pitch? 
+ + dec ecx ; + decrement count + jnz nextrow ; + next row + pop ebx + + } +} + + + +void vp8_post_proc_down_and_across_xmm +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +) +{ +#ifdef RELOCATEABLE + R4D2 +#endif + + __asm + { + movd xmm2, flimit + punpcklwd xmm2, xmm2 + punpckldq xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + mov esi, src_ptr + mov edi, dst_ptr + + mov ecx, DWORD PTR rows + mov eax, src_pixels_per_line ; + destination pitch? + pxor xmm0, xmm0 ; + mm0 = 00000000 + + nextrow: + + xor edx, edx ; + + clear out edx for use as loop counter + nextcol: + movq xmm3, QWORD PTR [esi] ; + + mm4 = r0 p0..p7 + punpcklbw xmm3, xmm0 ; + mm3 = p0..p3 + movdqa xmm1, xmm3 ; + mm1 = p0..p3 + psllw xmm3, 2 ; + + movq xmm5, QWORD PTR [esi + eax] ; + mm4 = r1 p0..p7 + punpcklbw xmm5, xmm0 ; + mm5 = r1 p0..p3 + paddusw xmm3, xmm5 ; + mm3 += mm6 + + ; + thresholding + movdqa xmm7, xmm1 ; + mm7 = r0 p0..p3 + psubusw xmm7, xmm5 ; + mm7 = r0 p0..p3 - r1 p0..p3 + psubusw xmm5, xmm1 ; + mm5 = r1 p0..p3 - r0 p0..p3 + paddusw xmm7, xmm5 ; + mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw xmm7, xmm2 + + movq xmm5, QWORD PTR [esi + 2*eax] ; + mm4 = r2 p0..p7 + punpcklbw xmm5, xmm0 ; + mm5 = r2 p0..p3 + paddusw xmm3, xmm5 ; + mm3 += mm5 + + ; + thresholding + movdqa xmm6, xmm1 ; + mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; + mm6 = r0 p0..p3 - r2 p0..p3 + psubusw xmm5, xmm1 ; + mm5 = r2 p0..p3 - r2 p0..p3 + paddusw xmm6, xmm5 ; + mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; + accumulate thresholds + + + neg eax + movq xmm5, QWORD PTR [esi+2*eax] ; + mm4 = r-2 p0..p7 + punpcklbw xmm5, xmm0 ; + mm5 = r-2 p0..p3 + paddusw xmm3, xmm5 ; + mm3 += mm5 + + ; + thresholding + movdqa xmm6, xmm1 ; + mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; + mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm5, xmm1 ; + mm5 = r-2 p0..p3 - p0..p3 + paddusw xmm6, xmm5 ; + mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw 
xmm6, xmm2 + por xmm7, xmm6 ; + accumulate thresholds + + movq xmm4, QWORD PTR [esi+eax] ; + mm4 = r-1 p0..p7 + punpcklbw xmm4, xmm0 ; + mm4 = r-1 p0..p3 + paddusw xmm3, xmm4 ; + mm3 += mm5 + + ; + thresholding + movdqa xmm6, xmm1 ; + mm6 = r0 p0..p3 + psubusw xmm6, xmm4 ; + mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm4, xmm1 ; + mm5 = r-1 p0..p3 - p0..p3 + paddusw xmm6, xmm4 ; + mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; + accumulate thresholds + + + paddusw xmm3, rd42 ; + mm3 += round value + psraw xmm3, 3 ; + mm3 /= 8 + + pand xmm1, xmm7 ; + mm1 select vals > thresh from source + pandn xmm7, xmm3 ; + mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; + combination + + packuswb xmm1, xmm0 ; + pack to bytes + movq QWORD PTR [edi], xmm1 ; + + neg eax ; + pitch is positive + add esi, 8 + add edi, 8 + + add edx, 8 + cmp edx, cols + + jl nextcol + + // done with the all cols, start the across filtering in place + sub esi, edx + sub edi, edx + + xor edx, edx + movq mm0, QWORD PTR [edi-8]; + + acrossnextcol: + movq xmm7, QWORD PTR [edi +edx -2] + movd xmm4, DWORD PTR [edi +edx +6] + + pslldq xmm4, 8 + por xmm4, xmm7 + + movdqa xmm3, xmm4 + psrldq xmm3, 2 + punpcklbw xmm3, xmm0 ; + mm3 = p0..p3 + movdqa xmm1, xmm3 ; + mm1 = p0..p3 + psllw xmm3, 2 + + + movdqa xmm5, xmm4 + psrldq xmm5, 3 + punpcklbw xmm5, xmm0 ; + mm5 = p1..p4 + paddusw xmm3, xmm5 ; + mm3 += mm6 + + ; + thresholding + movdqa xmm7, xmm1 ; + mm7 = p0..p3 + psubusw xmm7, xmm5 ; + mm7 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; + mm5 = p1..p4 - p0..p3 + paddusw xmm7, xmm5 ; + mm7 = abs(p0..p3 - p1..p4) + pcmpgtw xmm7, xmm2 + + movdqa xmm5, xmm4 + psrldq xmm5, 4 + punpcklbw xmm5, xmm0 ; + mm5 = p2..p5 + paddusw xmm3, xmm5 ; + mm3 += mm5 + + ; + thresholding + movdqa xmm6, xmm1 ; + mm6 = p0..p3 + psubusw xmm6, xmm5 ; + mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; + mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; + mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, 
xmm6 ; + accumulate thresholds + + + movdqa xmm5, xmm4 ; + mm5 = p-2..p5 + punpcklbw xmm5, xmm0 ; + mm5 = p-2..p1 + paddusw xmm3, xmm5 ; + mm3 += mm5 + + ; + thresholding + movdqa xmm6, xmm1 ; + mm6 = p0..p3 + psubusw xmm6, xmm5 ; + mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; + mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; + mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; + accumulate thresholds + + psrldq xmm4, 1 ; + mm4 = p-1..p5 + punpcklbw xmm4, xmm0 ; + mm4 = p-1..p2 + paddusw xmm3, xmm4 ; + mm3 += mm5 + + ; + thresholding + movdqa xmm6, xmm1 ; + mm6 = p0..p3 + psubusw xmm6, xmm4 ; + mm6 = p0..p3 - p1..p4 + psubusw xmm4, xmm1 ; + mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm4 ; + mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; + accumulate thresholds + + paddusw xmm3, rd42 ; + mm3 += round value + psraw xmm3, 3 ; + mm3 /= 8 + + pand xmm1, xmm7 ; + mm1 select vals > thresh from source + pandn xmm7, xmm3 ; + mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; + combination + + packuswb xmm1, xmm0 ; + pack to bytes + movq QWORD PTR [edi+edx-8], mm0 ; + store previous four bytes + movdq2q mm0, xmm1 + + add edx, 8 + cmp edx, cols + jl acrossnextcol; + + // last 8 pixels + movq QWORD PTR [edi+edx-8], mm0 + + // done with this rwo + add esi, eax ; + next line + mov eax, dst_pixels_per_line ; + destination pitch? + add edi, eax ; + next destination + mov eax, src_pixels_per_line ; + destination pitch? 
+ + dec ecx ; + decrement count + jnz nextrow ; + next row + } +} + + +void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit) +{ + int c, i; + __declspec(align(16)) + int flimit2[2]; + __declspec(align(16)) + unsigned char d[16][8]; + + flimit = vp8_q2mbl(flimit); + + for (i = 0; i < 2; i++) + flimit2[i] = flimit; + + rows += 8; + + for (c = 0; c < cols; c += 4) + { + unsigned char *s = &dst[c]; + + __asm + { + mov esi, s ; + pxor mm0, mm0 ; + + mov eax, pitch ; + neg eax // eax = -pitch + + lea esi, [esi + eax*8]; // edi = s[-pitch*8] + neg eax + + + pxor mm5, mm5 + pxor mm6, mm6 ; + + pxor mm7, mm7 ; + mov edi, esi + + mov ecx, 15 ; + + loop_initvar: + movd mm1, DWORD PTR [edi]; + punpcklbw mm1, mm0 ; + + paddw mm5, mm1 ; + pmullw mm1, mm1 ; + + movq mm2, mm1 ; + punpcklwd mm1, mm0 ; + + punpckhwd mm2, mm0 ; + paddd mm6, mm1 ; + + paddd mm7, mm2 ; + lea edi, [edi+eax] ; + + dec ecx + jne loop_initvar + //save the var and sum + xor edx, edx + loop_row: + movd mm1, DWORD PTR [esi] // [s-pitch*8] + movd mm2, DWORD PTR [edi] // [s+pitch*7] + + punpcklbw mm1, mm0 + punpcklbw mm2, mm0 + + paddw mm5, mm2 + psubw mm5, mm1 + + pmullw mm2, mm2 + movq mm4, mm2 + + punpcklwd mm2, mm0 + punpckhwd mm4, mm0 + + paddd mm6, mm2 + paddd mm7, mm4 + + pmullw mm1, mm1 + movq mm2, mm1 + + punpcklwd mm1, mm0 + psubd mm6, mm1 + + punpckhwd mm2, mm0 + psubd mm7, mm2 + + + movq mm3, mm6 + pslld mm3, 4 + + psubd mm3, mm6 + movq mm1, mm5 + + movq mm4, mm5 + pmullw mm1, mm1 + + pmulhw mm4, mm4 + movq mm2, mm1 + + punpcklwd mm1, mm4 + punpckhwd mm2, mm4 + + movq mm4, mm7 + pslld mm4, 4 + + psubd mm4, mm7 + + psubd mm3, mm1 + psubd mm4, mm2 + + psubd mm3, flimit2 + psubd mm4, flimit2 + + psrad mm3, 31 + psrad mm4, 31 + + packssdw mm3, mm4 + packsswb mm3, mm0 + + movd mm1, DWORD PTR [esi+eax*8] + + movq mm2, mm1 + punpcklbw mm1, mm0 + + paddw mm1, mm5 + mov ecx, edx + + and ecx, 127 + movq mm4, vp8_rv[ecx*2] + + paddw mm1, mm4 + //paddw xmm1, eight8s + 
psraw mm1, 4 + + packuswb mm1, mm0 + pand mm1, mm3 + + pandn mm3, mm2 + por mm1, mm3 + + and ecx, 15 + movd DWORD PTR d[ecx*4], mm1 + + mov ecx, edx + sub ecx, 8 + + and ecx, 15 + movd mm1, DWORD PTR d[ecx*4] + + movd [esi], mm1 + lea esi, [esi+eax] + + lea edi, [edi+eax] + add edx, 1 + + cmp edx, rows + jl loop_row + + } + + } +} + +void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit) +{ + int c, i; + __declspec(align(16)) + int flimit4[4]; + __declspec(align(16)) + unsigned char d[16][8]; + + flimit = vp8_q2mbl(flimit); + + for (i = 0; i < 4; i++) + flimit4[i] = flimit; + + rows += 8; + + for (c = 0; c < cols; c += 8) + { + unsigned char *s = &dst[c]; + + __asm + { + mov esi, s ; + pxor xmm0, xmm0 ; + + mov eax, pitch ; + neg eax // eax = -pitch + + lea esi, [esi + eax*8]; // edi = s[-pitch*8] + neg eax + + + pxor xmm5, xmm5 + pxor xmm6, xmm6 ; + + pxor xmm7, xmm7 ; + mov edi, esi + + mov ecx, 15 ; + + loop_initvar: + movq xmm1, QWORD PTR [edi]; + punpcklbw xmm1, xmm0 ; + + paddw xmm5, xmm1 ; + pmullw xmm1, xmm1 ; + + movdqa xmm2, xmm1 ; + punpcklwd xmm1, xmm0 ; + + punpckhwd xmm2, xmm0 ; + paddd xmm6, xmm1 ; + + paddd xmm7, xmm2 ; + lea edi, [edi+eax] ; + + dec ecx + jne loop_initvar + //save the var and sum + xor edx, edx + loop_row: + movq xmm1, QWORD PTR [esi] // [s-pitch*8] + movq xmm2, QWORD PTR [edi] // [s+pitch*7] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + paddw xmm5, xmm2 + psubw xmm5, xmm1 + + pmullw xmm2, xmm2 + movdqa xmm4, xmm2 + + punpcklwd xmm2, xmm0 + punpckhwd xmm4, xmm0 + + paddd xmm6, xmm2 + paddd xmm7, xmm4 + + pmullw xmm1, xmm1 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm0 + psubd xmm6, xmm1 + + punpckhwd xmm2, xmm0 + psubd xmm7, xmm2 + + + movdqa xmm3, xmm6 + pslld xmm3, 4 + + psubd xmm3, xmm6 + movdqa xmm1, xmm5 + + movdqa xmm4, xmm5 + pmullw xmm1, xmm1 + + pmulhw xmm4, xmm4 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm4 + punpckhwd xmm2, xmm4 + + movdqa xmm4, xmm7 + pslld xmm4, 4 + + psubd 
xmm4, xmm7 + + psubd xmm3, xmm1 + psubd xmm4, xmm2 + + psubd xmm3, flimit4 + psubd xmm4, flimit4 + + psrad xmm3, 31 + psrad xmm4, 31 + + packssdw xmm3, xmm4 + packsswb xmm3, xmm0 + + movq xmm1, QWORD PTR [esi+eax*8] + + movq xmm2, xmm1 + punpcklbw xmm1, xmm0 + + paddw xmm1, xmm5 + mov ecx, edx + + and ecx, 127 + movdqu xmm4, vp8_rv[ecx*2] + + paddw xmm1, xmm4 + //paddw xmm1, eight8s + psraw xmm1, 4 + + packuswb xmm1, xmm0 + pand xmm1, xmm3 + + pandn xmm3, xmm2 + por xmm1, xmm3 + + and ecx, 15 + movq QWORD PTR d[ecx*8], xmm1 + + mov ecx, edx + sub ecx, 8 + + and ecx, 15 + movq mm0, d[ecx*8] + + movq [esi], mm0 + lea esi, [esi+eax] + + lea edi, [edi+eax] + add edx, 1 + + cmp edx, rows + jl loop_row + + } + + } +} +#if 0 +/**************************************************************************** + * + * ROUTINE : plane_add_noise_wmt + * + * INPUTS : unsigned char *Start starting address of buffer to add gaussian + * noise to + * unsigned int Width width of plane + * unsigned int Height height of plane + * int Pitch distance between subsequent lines of frame + * int q quantizer used to determine amount of noise + * to add + * + * OUTPUTS : None. + * + * RETURNS : void. + * + * FUNCTION : adds gaussian noise to a plane of pixels + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) +{ + unsigned int i; + + __declspec(align(16)) unsigned char blackclamp[16]; + __declspec(align(16)) unsigned char whiteclamp[16]; + __declspec(align(16)) unsigned char bothclamp[16]; + char char_dist[300]; + char Rand[2048]; + double sigma; +// return; + __asm emms + sigma = a + .5 + .6 * (63 - q) / 63.0; + + // set up a lookup table of 256 entries that matches + // a gaussian distribution with sigma determined by q. 
+ // + { + double i; + int next, j; + + next = 0; + + for (i = -32; i < 32; i++) + { + double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i); + int a = (int)(g + .5); + + if (a) + { + for (j = 0; j < a; j++) + { + char_dist[next+j] = (char) i; + } + + next = next + j; + } + + } + + for (next = next; next < 256; next++) + char_dist[next] = 0; + + } + + for (i = 0; i < 2048; i++) + { + Rand[i] = char_dist[rand() & 0xff]; + } + + for (i = 0; i < 16; i++) + { + blackclamp[i] = -char_dist[0]; + whiteclamp[i] = -char_dist[0]; + bothclamp[i] = -2 * char_dist[0]; + } + + for (i = 0; i < Height; i++) + { + unsigned char *Pos = Start + i * Pitch; + char *Ref = Rand + (rand() & 0xff); + + __asm + { + mov ecx, [Width] + mov esi, Pos + mov edi, Ref + xor eax, eax + + nextset: + movdqu xmm1, [esi+eax] // get the source + + psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise + paddusb xmm1, bothclamp + psubusb xmm1, whiteclamp + + movdqu xmm2, [edi+eax] // get the noise for this line + paddb xmm1, xmm2 // add it in + movdqu [esi+eax], xmm1 // store the result + + add eax, 16 // move to the next line + + cmp eax, ecx + jl nextset + + + } + + } +} +#endif +__declspec(align(16)) +static const int four8s[4] = { 8, 8, 8, 8}; +void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit) +{ + int r, i; + __declspec(align(16)) + int flimit4[4]; + unsigned char *s = src; + int sumsq; + int sum; + + + flimit = vp8_q2mbl(flimit); + flimit4[0] = + flimit4[1] = + flimit4[2] = + flimit4[3] = flimit; + + for (r = 0; r < rows; r++) + { + + + sumsq = 0; + sum = 0; + + for (i = -8; i <= 6; i++) + { + sumsq += s[i] * s[i]; + sum += s[i]; + } + + __asm + { + mov eax, sumsq + movd xmm7, eax + + mov eax, sum + movd xmm6, eax + + mov esi, s + xor ecx, ecx + + mov edx, cols + add edx, 8 + pxor mm0, mm0 + pxor mm1, mm1 + + pxor xmm0, xmm0 + nextcol4: + + movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5 + movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 
+9 +10 + + punpcklbw xmm1, xmm0 // expanding + punpcklbw xmm2, xmm0 // expanding + + punpcklwd xmm1, xmm0 // expanding to dwords + punpcklwd xmm2, xmm0 // expanding to dwords + + psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5 + paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2 + + paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5 + pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5 + + paddd xmm6, xmm2 + paddd xmm7, xmm1 + + pshufd xmm6, xmm6, 0 // duplicate the last ones + pshufd xmm7, xmm7, 0 // duplicate the last ones + + psrldq xmm1, 4 // 8--7 9--6 10--5 0000 + psrldq xmm2, 4 // 8--7 9--6 10--5 0000 + + pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared + pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared + + paddd xmm6, xmm4 + paddd xmm7, xmm3 + + pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared + pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared + pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + movdqa xmm3, xmm6 + pmaddwd xmm3, xmm3 + + movdqa xmm5, xmm7 + pslld xmm5, 4 + + psubd xmm5, xmm7 + psubd xmm5, xmm3 + + psubd xmm5, flimit4 + psrad xmm5, 31 + + packssdw xmm5, xmm0 + packsswb xmm5, xmm0 + + movd xmm1, DWORD PTR [esi+ecx] + movq xmm2, xmm1 + + punpcklbw xmm1, xmm0 + punpcklwd xmm1, xmm0 + + paddd xmm1, xmm6 + paddd xmm1, four8s + + psrad xmm1, 4 + packssdw xmm1, xmm0 + + packuswb xmm1, xmm0 + pand xmm1, xmm5 + + pandn xmm5, xmm2 + por xmm5, xmm1 + + movd [esi+ecx-8], mm0 + movq mm0, mm1 + + movdq2q mm1, xmm5 + psrldq xmm7, 12 + + psrldq xmm6, 12 + add ecx, 4 + + cmp ecx, edx + jl nextcol4 + + } + s += pitch; + } +} + +#if 0 + +/**************************************************************************** + * + * ROUTINE : plane_add_noise_mmx + * + * INPUTS : unsigned char *Start starting address of buffer to add gaussian + * noise to + * unsigned int Width width of plane + * unsigned int Height 
height of plane + * int Pitch distance between subsequent lines of frame + * int q quantizer used to determine amount of noise + * to add + * + * OUTPUTS : None. + * + * RETURNS : void. + * + * FUNCTION : adds gaussian noise to a plane of pixels + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) +{ + unsigned int i; + int Pitch4 = Pitch * 4; + const int noise_amount = 2; + const int noise_adder = 2 * noise_amount + 1; + + __declspec(align(16)) unsigned char blackclamp[16]; + __declspec(align(16)) unsigned char whiteclamp[16]; + __declspec(align(16)) unsigned char bothclamp[16]; + + char char_dist[300]; + char Rand[2048]; + + double sigma; + __asm emms + sigma = a + .5 + .6 * (63 - q) / 63.0; + + // set up a lookup table of 256 entries that matches + // a gaussian distribution with sigma determined by q. + // + { + double i, sum = 0; + int next, j; + + next = 0; + + for (i = -32; i < 32; i++) + { + int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); + + if (a) + { + for (j = 0; j < a; j++) + { + char_dist[next+j] = (char) i; + } + + next = next + j; + } + + } + + for (next = next; next < 256; next++) + char_dist[next] = 0; + + } + + for (i = 0; i < 2048; i++) + { + Rand[i] = char_dist[rand() & 0xff]; + } + + for (i = 0; i < 16; i++) + { + blackclamp[i] = -char_dist[0]; + whiteclamp[i] = -char_dist[0]; + bothclamp[i] = -2 * char_dist[0]; + } + + for (i = 0; i < Height; i++) + { + unsigned char *Pos = Start + i * Pitch; + char *Ref = Rand + (rand() & 0xff); + + __asm + { + mov ecx, [Width] + mov esi, Pos + mov edi, Ref + xor eax, eax + + nextset: + movq mm1, [esi+eax] // get the source + + psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise + paddusb mm1, bothclamp + psubusb mm1, whiteclamp + + movq mm2, [edi+eax] // get the noise for this line + paddb mm1, mm2 // 
add it in + movq [esi+eax], mm1 // store the result + + add eax, 8 // move to the next line + + cmp eax, ecx + jl nextset + + + } + + } +} +#else +extern char an[8][64][3072]; +extern int cd[8][64]; + +void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) +{ + unsigned int i; + __declspec(align(16)) unsigned char blackclamp[16]; + __declspec(align(16)) unsigned char whiteclamp[16]; + __declspec(align(16)) unsigned char bothclamp[16]; + + + __asm emms + + for (i = 0; i < 16; i++) + { + blackclamp[i] = -cd[a][q]; + whiteclamp[i] = -cd[a][q]; + bothclamp[i] = -2 * cd[a][q]; + } + + for (i = 0; i < Height; i++) + { + unsigned char *Pos = Start + i * Pitch; + char *Ref = an[a][q] + (rand() & 0xff); + + __asm + { + mov ecx, [Width] + mov esi, Pos + mov edi, Ref + xor eax, eax + + nextset: + movq mm1, [esi+eax] // get the source + + psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise + paddusb mm1, bothclamp + psubusb mm1, whiteclamp + + movq mm2, [edi+eax] // get the noise for this line + paddb mm1, mm2 // add it in + movq [esi+eax], mm1 // store the result + + add eax, 8 // move to the next line + + cmp eax, ecx + jl nextset + } + } +} + + +void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) +{ + unsigned int i; + + __declspec(align(16)) unsigned char blackclamp[16]; + __declspec(align(16)) unsigned char whiteclamp[16]; + __declspec(align(16)) unsigned char bothclamp[16]; + + __asm emms + + for (i = 0; i < 16; i++) + { + blackclamp[i] = -cd[a][q]; + whiteclamp[i] = -cd[a][q]; + bothclamp[i] = -2 * cd[a][q]; + } + + for (i = 0; i < Height; i++) + { + unsigned char *Pos = Start + i * Pitch; + char *Ref = an[a][q] + (rand() & 0xff); + + __asm + { + mov ecx, [Width] + mov esi, Pos + mov edi, Ref + xor eax, eax + + nextset: + movdqu xmm1, [esi+eax] // get the source + + psubusb xmm1, blackclamp // clamp both sides so we 
don't outrange adding noise + paddusb xmm1, bothclamp + psubusb xmm1, whiteclamp + + movdqu xmm2, [edi+eax] // get the noise for this line + paddb xmm1, xmm2 // add it in + movdqu [esi+eax], xmm1 // store the result + + add eax, 16 // move to the next line + + cmp eax, ecx + jl nextset + } + } +} + +#endif diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm new file mode 100644 index 000000000..bfa36fa70 --- /dev/null +++ b/vp8/common/x86/postproc_sse2.asm @@ -0,0 +1,688 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_post_proc_down_and_across_xmm +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp8_post_proc_down_and_across_xmm) +sym(vp8_post_proc_down_and_across_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ALIGN_STACK 16, rax + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movdqa xmm0, [rd42 GLOBAL] + sub rsp, 16 + movdqa [rsp], xmm0 +%define RD42 [rsp] +%else +%define RD42 [rd42 GLOBAL] +%endif + + + movd xmm2, dword ptr arg(6) ;flimit + punpcklwd xmm2, xmm2 + punpckldq xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
+ pxor xmm0, xmm0 ; mm0 = 00000000 + +nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +nextcol: + movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 ; + + movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = r0 p0..p3 + psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw xmm7, xmm2 + + movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + neg rax + movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, xmm3 
; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi], xmm1 ; + + neg rax ; pitch is positive + add rsi, 8 + add rdi, 8 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + + jl nextcol + + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-8]; + +acrossnextcol: + movq xmm7, QWORD PTR [rdi +rdx -2] + movd xmm4, DWORD PTR [rdi +rdx +6] + + pslldq xmm4, 8 + por xmm4, xmm7 + + movdqa xmm3, xmm4 + psrldq xmm3, 2 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 + + + movdqa xmm5, xmm4 + psrldq xmm5, 3 + punpcklbw xmm5, xmm0 ; mm5 = p1..p4 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = p0..p3 + psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw xmm7, xmm2 + + movdqa xmm5, xmm4 + psrldq xmm5, 4 + punpcklbw xmm5, xmm0 ; mm5 = p2..p5 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + movdqa xmm5, xmm4 ; mm5 = p-2..p5 + punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + psrldq xmm4, 1 ; mm4 = p-1..p5 + punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 + psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw 
xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes + movdq2q mm0, xmm1 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + jl acrossnextcol; + + ; last 8 pixels + movq QWORD PTR [rdi+rdx-8], mm0 + + ; done with this rwo + add rsi,rax ; next line + mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? + + dec rcx ; decrement count + jnz nextrow ; next row + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + add rsp,16 + pop rsp +%endif + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef RD42 + + +;void vp8_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_xmm) +sym(vp8_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [sym(vp8_rv) GLOBAL] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; c<cols; c+=8) +loop_col: + mov rsi, arg(0) ; s + pxor xmm0, xmm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + neg rax ; rax = -pitch + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + + pxor xmm5, xmm5 + pxor xmm6, xmm6 ; + + pxor xmm7, xmm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +loop_initvar: + movq xmm1, 
QWORD PTR [rdi]; + punpcklbw xmm1, xmm0 ; + + paddw xmm5, xmm1 ; + pmullw xmm1, xmm1 ; + + movdqa xmm2, xmm1 ; + punpcklwd xmm1, xmm0 ; + + punpckhwd xmm2, xmm0 ; + paddd xmm6, xmm1 ; + + paddd xmm7, xmm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne loop_initvar + ;save the var and sum + xor rdx, rdx +loop_row: + movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] + movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + paddw xmm5, xmm2 + psubw xmm5, xmm1 + + pmullw xmm2, xmm2 + movdqa xmm4, xmm2 + + punpcklwd xmm2, xmm0 + punpckhwd xmm4, xmm0 + + paddd xmm6, xmm2 + paddd xmm7, xmm4 + + pmullw xmm1, xmm1 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm0 + psubd xmm6, xmm1 + + punpckhwd xmm2, xmm0 + psubd xmm7, xmm2 + + + movdqa xmm3, xmm6 + pslld xmm3, 4 + + psubd xmm3, xmm6 + movdqa xmm1, xmm5 + + movdqa xmm4, xmm5 + pmullw xmm1, xmm1 + + pmulhw xmm4, xmm4 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm4 + punpckhwd xmm2, xmm4 + + movdqa xmm4, xmm7 + pslld xmm4, 4 + + psubd xmm4, xmm7 + + psubd xmm3, xmm1 + psubd xmm4, xmm2 + + psubd xmm3, flimit4 + psubd xmm4, flimit4 + + psrad xmm3, 31 + psrad xmm4, 31 + + packssdw xmm3, xmm4 + packsswb xmm3, xmm0 + + movq xmm1, QWORD PTR [rsi+rax*8] + + movq xmm2, xmm1 + punpcklbw xmm1, xmm0 + + paddw xmm1, xmm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [sym(vp8_rv) GLOBAL] + movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movdqu xmm4, [sym(vp8_rv) + rcx*2] +%endif + + paddw xmm1, xmm4 + ;paddw xmm1, eight8s + psraw xmm1, 4 + + packuswb xmm1, xmm0 + pand xmm1, xmm3 + + pandn xmm3, xmm2 + por xmm1, xmm3 + + and rcx, 15 + movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] + + mov rcx, rdx + sub rcx, 8 + + and rcx, 15 + movq mm0, [rsp + rcx*8] ;d[rcx*8] + + movq [rsi], mm0 + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl loop_row + + add dword arg(0), 8 
; s += 8 + sub dword arg(3), 8 ; cols -= 8 + cmp dword arg(3), 0 + jg loop_col + + add rsp, 128+16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vp8_mbpost_proc_across_ip_xmm) +sym(vp8_mbpost_proc_across_ip_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 + + ; create flimit4 at [rsp] + mov eax, dword ptr arg(4) ;flimit + mov [rsp], eax + mov [rsp+4], eax + mov [rsp+8], eax + mov [rsp+12], eax +%define flimit4 [rsp] + + + ;for(r=0;r<rows;r++) +ip_row_loop: + + xor rdx, rdx ;sumsq=0; + xor rcx, rcx ;sum=0; + mov rsi, arg(0); s + mov rdi, -8 +ip_var_loop: + ;for(i=-8;i<=6;i++) + ;{ + ; sumsq += s[i]*s[i]; + ; sum += s[i]; + ;} + movzx eax, byte [rsi+rdi] + add ecx, eax + mul al + add edx, eax + add rdi, 1 + cmp rdi, 6 + jle ip_var_loop + + + ;mov rax, sumsq + ;movd xmm7, rax + movd xmm7, edx + + ;mov rax, sum + ;movd xmm6, rax + movd xmm6, ecx + + mov rsi, arg(0) ;s + xor rcx, rcx + + movsxd rdx, dword arg(3) ;cols + add rdx, 8 + pxor mm0, mm0 + pxor mm1, mm1 + + pxor xmm0, xmm0 +nextcol4: + + movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 + movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 + + punpcklbw xmm1, xmm0 ; expanding + punpcklbw xmm2, xmm0 ; expanding + + punpcklwd xmm1, xmm0 ; expanding to dwords + punpcklwd xmm2, xmm0 ; expanding to dwords + + psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 + paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 + + paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 + pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 + + paddd xmm6, xmm2 + paddd xmm7, xmm1 + + pshufd xmm6, xmm6, 0 ; duplicate the last ones + pshufd xmm7, xmm7, 0 ; duplicate the last ones + + psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 + psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 + + pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 
8--7 squared + pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared + + paddd xmm6, xmm4 + paddd xmm7, xmm3 + + pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared + pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared + pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + movdqa xmm3, xmm6 + pmaddwd xmm3, xmm3 + + movdqa xmm5, xmm7 + pslld xmm5, 4 + + psubd xmm5, xmm7 + psubd xmm5, xmm3 + + psubd xmm5, flimit4 + psrad xmm5, 31 + + packssdw xmm5, xmm0 + packsswb xmm5, xmm0 + + movd xmm1, DWORD PTR [rsi+rcx] + movq xmm2, xmm1 + + punpcklbw xmm1, xmm0 + punpcklwd xmm1, xmm0 + + paddd xmm1, xmm6 + paddd xmm1, [four8s GLOBAL] + + psrad xmm1, 4 + packssdw xmm1, xmm0 + + packuswb xmm1, xmm0 + pand xmm1, xmm5 + + pandn xmm5, xmm2 + por xmm5, xmm1 + + movd [rsi+rcx-8], mm0 + movq mm0, mm1 + + movdq2q mm1, xmm5 + psrldq xmm7, 12 + + psrldq xmm6, 12 + add rcx, 4 + + cmp rcx, rdx + jl nextcol4 + + ;s+=pitch; + movsxd rax, dword arg(1) + add arg(0), rax + + sub dword arg(2), 1 ;rows-=1 + cmp dword arg(2), 0 + jg ip_row_loop + + add rsp, 16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +extern sym(rand) +global sym(vp8_plane_add_noise_wmt) +sym(vp8_plane_add_noise_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +addnoise_loop: + call sym(rand) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. 
Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb xmm1, [rdx+32] ;bothclamp + psubusb xmm1, [rdx+16] ;whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd42: + times 8 dw 0x04 +four8s: + times 4 dd 8 diff --git a/vp8/common/x86/postproc_x86.h b/vp8/common/x86/postproc_x86.h new file mode 100644 index 000000000..49a190793 --- /dev/null +++ b/vp8/common/x86/postproc_x86.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef POSTPROC_X86_H +#define POSTPROC_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_postproc_inplace(vp8_mbpost_proc_down_mmx); +extern prototype_postproc(vp8_post_proc_down_and_across_mmx); +extern prototype_postproc_addnoise(vp8_plane_add_noise_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_postproc_down +#define vp8_postproc_down vp8_mbpost_proc_down_mmx + +#undef vp8_postproc_downacross +#define vp8_postproc_downacross vp8_post_proc_down_and_across_mmx + +#undef vp8_postproc_addnoise +#define vp8_postproc_addnoise vp8_plane_add_noise_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_postproc_inplace(vp8_mbpost_proc_down_xmm); +extern prototype_postproc_inplace(vp8_mbpost_proc_across_ip_xmm); +extern prototype_postproc(vp8_post_proc_down_and_across_xmm); +extern prototype_postproc_addnoise(vp8_plane_add_noise_wmt); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_postproc_down +#define vp8_postproc_down vp8_mbpost_proc_down_xmm + +#undef vp8_postproc_across +#define vp8_postproc_across vp8_mbpost_proc_across_ip_xmm + +#undef vp8_postproc_downacross +#define vp8_postproc_downacross vp8_post_proc_down_and_across_xmm + +#undef vp8_postproc_addnoise +#define vp8_postproc_addnoise vp8_plane_add_noise_wmt + + +#endif +#endif + +#endif diff --git a/vp8/common/x86/recon_mmx.asm b/vp8/common/x86/recon_mmx.asm new file mode 100644 index 000000000..ba60c5db7 --- /dev/null +++ b/vp8/common/x86/recon_mmx.asm @@ -0,0 +1,320 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" +;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride) +global sym(vp8_recon_b_mmx) +sym(vp8_recon_b_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rdi, arg(2) ;d + mov rdx, arg(1) ;q + movsxd rax, dword ptr arg(3) ;stride + pxor mm0, mm0 + + movd mm1, [rsi] + punpcklbw mm1, mm0 + paddsw mm1, [rdx] + packuswb mm1, mm0 ; pack and unpack to saturate + movd [rdi], mm1 + + movd mm2, [rsi+16] + punpcklbw mm2, mm0 + paddsw mm2, [rdx+32] + packuswb mm2, mm0 ; pack and unpack to saturate + movd [rdi+rax], mm2 + + movd mm3, [rsi+32] + punpcklbw mm3, mm0 + paddsw mm3, [rdx+64] + packuswb mm3, mm0 ; pack and unpack to saturate + movd [rdi+2*rax], mm3 + + add rdi, rax + movd mm4, [rsi+48] + punpcklbw mm4, mm0 + paddsw mm4, [rdx+96] + packuswb mm4, mm0 ; pack and unpack to saturate + movd [rdi+2*rax], mm4 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem8x8_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x8_mmx) +sym(vp8_copy_mem8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + add rsi, rax + + movq [rdi+rcx], mm1 + movq [rdi+rcx*2], mm2 + + + lea rdi, [rdi+rcx*2] + movq mm3, [rsi] + + add rdi, rcx + movq mm4, [rsi+rax] + + movq mm5, [rsi+rax*2] + movq [rdi], mm3 + + lea rsi, [rsi+rax*2] + movq [rdi+rcx], mm4 + + movq [rdi+rcx*2], mm5 + lea rdi, [rdi+rcx*2] + + movq mm0, [rsi+rax] + movq mm1, [rsi+rax*2] + + movq [rdi+rcx], mm0 + movq [rdi+rcx*2],mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + 
+;void copy_mem8x4_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x4_mmx) +sym(vp8_copy_mem8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + movq [rdi+rcx], mm1 + + movq [rdi+rcx*2], mm2 + lea rdi, [rdi+rcx*2] + + movq mm3, [rsi+rax] + movq [rdi+rcx], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem16x16_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_mmx) +sym(vp8_copy_mem16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movsxd rax, dword ptr arg(1) ;src_stride; + + mov rdi, arg(2) ;dst; + movsxd rcx, dword ptr arg(3) ;dst_stride + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + 
add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq [rdi], mm0 + movq [rdi+8], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm new file mode 100644 index 000000000..f2685a76f --- /dev/null +++ b/vp8/common/x86/recon_sse2.asm @@ -0,0 +1,228 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" +;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride) +global sym(vp8_recon2b_sse2) +sym(vp8_recon2b_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rdi, arg(2) ;d + mov rdx, arg(1) ;q + movsxd rax, dword ptr arg(3) ;stride + pxor xmm0, xmm0 + + movq xmm1, MMWORD PTR [rsi] + punpcklbw xmm1, xmm0 + paddsw xmm1, XMMWORD PTR [rdx] + packuswb xmm1, xmm0 ; pack and unpack to saturate + movq MMWORD PTR [rdi], xmm1 + + + movq xmm2, MMWORD PTR [rsi+8] + punpcklbw xmm2, xmm0 + paddsw xmm2, XMMWORD PTR [rdx+16] + packuswb xmm2, xmm0 ; pack and unpack to saturate + movq MMWORD PTR [rdi+rax], xmm2 + + + movq xmm3, MMWORD PTR [rsi+16] + punpcklbw xmm3, xmm0 + paddsw xmm3, XMMWORD PTR [rdx+32] + packuswb xmm3, xmm0 ; pack and unpack to saturate + movq MMWORD PTR [rdi+rax*2], xmm3 + + add rdi, rax + movq xmm4, MMWORD PTR [rsi+24] + punpcklbw xmm4, xmm0 + paddsw xmm4, XMMWORD PTR [rdx+48] + packuswb xmm4, xmm0 ; pack and unpack to saturate + movq MMWORD PTR [rdi+rax*2], xmm4 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride) +global sym(vp8_recon4b_sse2) +sym(vp8_recon4b_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rdi, arg(2) ;d + mov rdx, arg(1) ;q + movsxd rax, dword ptr arg(3) ;stride + pxor xmm0, xmm0 + + movdqa xmm1, XMMWORD PTR [rsi] + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm5, xmm0 + paddsw xmm1, XMMWORD PTR [rdx] + paddsw xmm5, XMMWORD PTR [rdx+16] + packuswb xmm1, xmm5 ; pack and unpack to saturate + movdqa XMMWORD PTR [rdi], xmm1 + + + movdqa xmm2, XMMWORD PTR [rsi+16] + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm0 + punpckhbw xmm6, xmm0 + paddsw xmm2, XMMWORD PTR [rdx+32] + paddsw xmm6, XMMWORD PTR [rdx+48] + packuswb xmm2, 
xmm6 ; pack and unpack to saturate + movdqa XMMWORD PTR [rdi+rax], xmm2 + + + movdqa xmm3, XMMWORD PTR [rsi+32] + movdqa xmm7, xmm3 + punpcklbw xmm3, xmm0 + punpckhbw xmm7, xmm0 + paddsw xmm3, XMMWORD PTR [rdx+64] + paddsw xmm7, XMMWORD PTR [rdx+80] + packuswb xmm3, xmm7 ; pack and unpack to saturate + movdqa XMMWORD PTR [rdi+rax*2], xmm3 + + add rdi, rax + movdqa xmm4, XMMWORD PTR [rsi+48] + movdqa xmm5, xmm4 + punpcklbw xmm4, xmm0 + punpckhbw xmm5, xmm0 + paddsw xmm4, XMMWORD PTR [rdx+96] + paddsw xmm5, XMMWORD PTR [rdx+112] + packuswb xmm4, xmm5 ; pack and unpack to saturate + movdqa XMMWORD PTR [rdi+rax*2], xmm4 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem16x16_sse2( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_sse2) +sym(vp8_copy_mem16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movdqu xmm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movdqu xmm1, [rsi+rax] + movdqu xmm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + lea rdi, [rdi+rcx*2] + movdqu xmm3, [rsi] + + add rdi, rcx + movdqu xmm4, [rsi+rax] + + movdqu xmm5, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm3 + add rsi, rax + + movdqa [rdi+rcx], xmm4 + movdqa [rdi+rcx*2],xmm5 + + lea rdi, [rdi+rcx*2] + movdqu xmm0, [rsi] + + add rdi, rcx + movdqu xmm1, [rsi+rax] + + movdqu xmm2, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + + movdqa [rdi+rcx*2], xmm2 + movdqu xmm3, [rsi] + + movdqu xmm4, [rsi+rax] + lea rdi, [rdi+rcx*2] + + add rdi, rcx + movdqu xmm5, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm3 + + add rsi, rax + movdqa [rdi+rcx], xmm4 + + movdqa [rdi+rcx*2],xmm5 + 
movdqu xmm0, [rsi] + + lea rdi, [rdi+rcx*2] + movdqu xmm1, [rsi+rax] + + add rdi, rcx + movdqu xmm2, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm0 + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + movdqu xmm3, [rsi+rax] + lea rdi, [rdi+rcx*2] + + movdqa [rdi+rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h new file mode 100644 index 000000000..c46977842 --- /dev/null +++ b/vp8/common/x86/recon_x86.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef RECON_X86_H +#define RECON_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_recon_block(vp8_recon_b_mmx); +extern prototype_copy_block(vp8_copy_mem8x8_mmx); +extern prototype_copy_block(vp8_copy_mem8x4_mmx); +extern prototype_copy_block(vp8_copy_mem16x16_mmx); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_recon_recon +#define vp8_recon_recon vp8_recon_b_mmx + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp8_copy_mem8x4_mmx + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_mmx + +#endif +#endif + +#if HAVE_SSE2 +extern prototype_recon_block(vp8_recon2b_sse2); +extern prototype_recon_block(vp8_recon4b_sse2); +extern prototype_copy_block(vp8_copy_mem16x16_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp8_recon2b_sse2 + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp8_recon4b_sse2 + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2 + +#endif +#endif +#endif diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm new file mode 100644 index 000000000..c50211813 --- /dev/null +++ b/vp8/common/x86/subpixel_mmx.asm @@ -0,0 +1,817 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +%define BLOCK_HEIGHT_WIDTH 4 +%define vp8_filter_weight 128 +%define VP8_FILTER_SHIFT 7 + + +;void vp8_filter_block1d_h6_mmx +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1d_h6_mmx) +sym(vp8_filter_block1d_h6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + + movq mm1, [rdx + 16] ; do both the negative taps first!!! + movq mm2, [rdx + 32] ; + movq mm6, [rdx + 48] ; + movq mm7, [rdx + 64] ; + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + +nextrow: + movq mm3, [rsi-2] ; mm3 = p-2..p5 + movq mm4, mm3 ; mm4 = p-2..p5 + psrlq mm3, 8 ; mm3 = p-1..p5 + punpcklbw mm3, mm0 ; mm3 = p-1..p2 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. 
+ + movq mm5, mm4 ; mm5 = p-2..p5 + punpckhbw mm4, mm0 ; mm5 = p2..p5 + pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + movq mm4, mm5 ; mm4 = p-2..p5; + psrlq mm5, 16 ; mm5 = p0..p5; + punpcklbw mm5, mm0 ; mm5 = p0..p3 + pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + movq mm5, mm4 ; mm5 = p-2..p5 + psrlq mm4, 24 ; mm4 = p1..p5 + punpcklbw mm4, mm0 ; mm4 = p1..p4 + pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + ; do outer positive taps + movd mm4, [rsi+3] + punpcklbw mm4, mm0 ; mm5 = p3..p6 + pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + paddsw mm3, [rd GLOBAL] ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and unpack to saturate + punpcklbw mm3, mm0 ; + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line + add rdi, rax; +%else + movsxd r8, dword ptr arg(2) ;src_pixels_per_line + add rdi, rax; + + add rsi, r8 ; next line +%endif + + dec rcx ; decrement count + jnz nextrow ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +; +; THIS FUNCTION APPEARS TO BE UNUSED +; +;void vp8_filter_block1d_v6_mmx +;( +; short *src_ptr, +; unsigned char *output_ptr, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1d_v6_mmx) +sym(vp8_filter_block1d_v6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movq mm5, [rd GLOBAL] + push rbx + mov rbx, arg(6) ;vp8_filter + movq mm1, [rbx + 16] ; do both the negative taps first!!! 
+ movq mm2, [rbx + 32] ; + movq mm6, [rbx + 48] ; + movq mm7, [rbx + 64] ; + + movsxd rdx, dword ptr arg(2) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + sub rsi, rdx + sub rsi, rdx + movsxd rcx, DWORD PTR arg(4) ;output_height + movsxd rax, DWORD PTR arg(5) ;output_width ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + + +nextrow_v: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. 
+ paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + + add rdi,rax; + + dec rcx ; decrement count + jnz nextrow_v ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1dc_v6_mmx +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int output_pitch, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1dc_v6_mmx) +sym(vp8_filter_block1dc_v6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movq mm5, [rd GLOBAL] + push rbx + mov rbx, arg(7) ;vp8_filter + movq mm1, [rbx + 16] ; do both the negative taps first!!! + movq mm2, [rbx + 32] ; + movq mm6, [rbx + 48] ; + movq mm7, [rbx + 64] ; + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + sub rsi, rdx + sub rsi, rdx + movsxd rcx, DWORD PTR arg(5) ;output_height + movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + + +nextrow_cv: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. 
+ paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the + ; recon block should be in cache this shouldn't cost much. Its obviously + ; avoidable!!!. + lea rdi, [rdi+rax] ; + dec rcx ; decrement count + jnz nextrow_cv ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x8_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_mmx) +sym(vp8_bilinear_predict8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset]; + ;const short *VFilter = bilinear_filters_mmx[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + shl rax, 5 ; offset * 32 + lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + + shl rax, 5 ; offset*32 + add rax, rcx ; VFilter + + lea rcx, [rdi+rdx*8] ; + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + 
punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [rd GLOBAL] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +next_row_8x8: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [rd GLOBAL] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [rd GLOBAL] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 ;dst_pitch +%endif + cmp rdi, rcx ; + jne next_row_8x8 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x4_mmx) +sym(vp8_bilinear_predict8x4_mmx): + push rbp + mov rbp, rsp + 
SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset]; + ;const short *VFilter = bilinear_filters_mmx[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] + shl rax, 5 + + mov rsi, arg(0) ;src_ptr ; + add rax, rcx + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [rd GLOBAL] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +next_row_8x4: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [rd GLOBAL] ; + psraw mm4, VP8_FILTER_SHIFT ; 
+ + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [rd GLOBAL] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 +%endif + cmp rdi, rcx ; + jne next_row_8x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict4x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict4x4_mmx) +sym(vp8_bilinear_predict4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset]; + ;const short *VFilter = bilinear_filters_mmx[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] + shl rax, 5 + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;ldst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + paddw mm3, [rd GLOBAL] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movq mm7, mm3 ; + packuswb mm7, 
mm0 ; + + add rsi, rdx ; next line +next_row_4x4: + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + + movq mm5, mm7 ; + punpcklbw mm5, mm0 ; + + pmullw mm5, [rax] ; + paddw mm3, [rd GLOBAL] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + movq mm7, mm3 ; + + packuswb mm7, mm0 ; + + pmullw mm3, [rax+16] ; + paddw mm3, mm5 ; + + + paddw mm3, [rd GLOBAL] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb mm3, mm0 + movd [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch ; + add rsi, rdx ; next line + add rdi, r8 +%endif + + cmp rdi, rcx ; + jne next_row_4x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +align 16 +rd: + times 4 dw 0x40 + +align 16 +global sym(vp8_six_tap_mmx) +sym(vp8_six_tap_mmx): + times 8 dw 0 + times 8 dw 0 + times 8 dw 128 + times 8 dw 0 + times 8 dw 0 + times 8 dw 0 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw 0 + + times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + + times 8 dw 0 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw 0 + + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw 0 + + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + + times 8 dw 0 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + times 8 dw 0 + + +align 16 +global sym(vp8_bilinear_filters_mmx) +sym(vp8_bilinear_filters_mmx): + times 8 dw 128 
+ times 8 dw 0 + + times 8 dw 112 + times 8 dw 16 + + times 8 dw 96 + times 8 dw 32 + + times 8 dw 80 + times 8 dw 48 + + times 8 dw 64 + times 8 dw 64 + + times 8 dw 48 + times 8 dw 80 + + times 8 dw 32 + times 8 dw 96 + + times 8 dw 16 + times 8 dw 112 diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm new file mode 100644 index 000000000..dee04f2d9 --- /dev/null +++ b/vp8/common/x86/subpixel_sse2.asm @@ -0,0 +1,1032 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. 
+;*************************************************************************************/ +;void vp8_filter_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +global sym(vp8_filter_block1d8_h6_sse2) +sym(vp8_filter_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +filter_block1d8_h6_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, 
[rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [rd GLOBAL] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + + jnz filter_block1d8_h6_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. 
+;*************************************************************************************/ +global sym(vp8_filter_block1d16_h6_sse2) +sym(vp8_filter_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +filter_block1d16_h6_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + 
paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [rd GLOBAL] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [rd GLOBAL] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + + dec rcx + jnz filter_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int 
pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp8_filter_block1d8_v6_sse2) +sym(vp8_filter_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [rd GLOBAL] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +vp8_filter_block1d8_v6_sse2_loop: + movdqa xmm1, XMMWORD PTR [rsi] + pmullw xmm1, [rax] + + movdqa xmm2, XMMWORD PTR [rsi + rdx] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] + pmullw xmm3, [rax + 32] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] + pmullw xmm5, [rax + 64] + + add rsi, rdx + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] + + pmullw xmm4, [rax + 48] + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] + + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz vp8_filter_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_unpack_block1d16_h6_sse2 +;( +; 
unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_unpack_block1d16_h6_sse2) +sym(vp8_unpack_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +unpack_block1d16_h6_sse2_rowloop: + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + punpcklbw xmm1, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm1 + movdqa XMMWORD Ptr [rdi + 16], xmm3 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz unpack_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_unpack_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_unpack_block1d8_h6_sse2) +sym(vp8_unpack_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +unpack_block1d8_h6_sse2_rowloop: + movq xmm1, MMWORD PTR 
[rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + lea rsi, [rsi + rax] + + punpcklbw xmm1, xmm0 + movdqa XMMWORD Ptr [rdi], xmm1 + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz unpack_block1d8_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_pack_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_pack_block1d8_v6_sse2) +sym(vp8_pack_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + + mov rsi, arg(0) ;src_ptr + movsxd rcx, DWORD PTR arg(4) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width ; Pitch for Source +%endif + +pack_block1d8_v6_sse2_loop: + movdqa xmm0, XMMWORD PTR [rsi] + packuswb xmm0, xmm0 + + movq QWORD PTR [rdi], xmm0 ; store the results in the destination + lea rsi, [rsi+rdx] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz pack_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_pack_block1d16_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_pack_block1d16_v6_sse2) +sym(vp8_pack_block1d16_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + + mov rsi, arg(0) ;src_ptr + movsxd rcx, DWORD PTR arg(4) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, 
dword ptr arg(2) ;dst_pitch +%endif + +pack_block1d16_v6_sse2_loop: + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm1, XMMWORD PTR [rsi+16] + + packuswb xmm0, xmm1 + movdqa XMMWORD PTR [rdi], xmm0 ; store the results in the destination + + add rsi, rdx +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(2) ;dst_pitch +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz pack_block1d16_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict16x16_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp8_bilinear_filters_mmx) +global sym(vp8_bilinear_predict16x16_sse2) +sym(vp8_bilinear_predict16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset] + ;const short *VFilter = bilinear_filters_mmx[yoffset] + + lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] + movsxd rax, dword ptr arg(2) ;xoffset + + cmp rax, 0 ;skip first_pass filter if xoffset=0 + je b16x16_sp_only + + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + + cmp rax, 0 ;skip second_pass filter if yoffset=0 + je b16x16_fp_only + + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;dst_pitch +%endif + ; get the first horizontal line done + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, 
[rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [rd GLOBAL] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + add rsi, rdx ; next line +next_row: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, [rax] + pmullw xmm6, [rax] + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [rd GLOBAL] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + pmullw xmm3, [rax+16] + pmullw xmm4, [rax+16] + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [rd GLOBAL] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rdx ; next line +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ;dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne next_row + + jmp done + +b16x16_sp_only: + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + + ; get 
the first horizontal line done + movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + add rsi, rax ; next line +next_row_spo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + movdqa xmm4, xmm3 ; make a copy of current line + movdqa xmm7, xmm3 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm5, xmm1 + pmullw xmm6, xmm1 + pmullw xmm3, xmm2 + pmullw xmm4, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [rd GLOBAL] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ;dst_pitch + cmp rdi, rcx + jne next_row_spo + + jmp done + +b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + pxor xmm0, xmm0 + +next_row_fpo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [rd GLOBAL] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ; dst_pitch + cmp rdi, rcx + jne next_row_fpo + +done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict8x8_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; 
int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp8_bilinear_filters_mmx) +global sym(vp8_bilinear_predict8x8_sse2) +sym(vp8_bilinear_predict8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + ;const short *HFilter = bilinear_filters_mmx[xoffset] + ;const short *VFilter = bilinear_filters_mmx[yoffset] + lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ;xoffset + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm5, [rax] + movdqa xmm6, [rax+16] + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqa xmm3, XMMWORD PTR [rsp] + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + + paddw xmm3, [rd GLOBAL] 
; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + add rsp, 16 ; next line +next_row8x8: + movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + pmullw xmm7, xmm5 + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm4, xmm3 + + pmullw xmm3, xmm6 + paddw xmm3, xmm7 + + movdqa xmm7, xmm4 + + paddw xmm3, [rd GLOBAL] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb xmm3, xmm0 + movq [rdi], xmm3 ; store the results in the destination + + add rsp, 16 ; next line + add rdi, rdx + + cmp rdi, rcx + jne next_row8x8 + + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 8 dw 0x40 diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h new file mode 100644 index 000000000..efa7b2e09 --- /dev/null +++ b/vp8/common/x86/subpixel_x86.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef SUBPIXEL_X86_H +#define SUBPIXEL_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_mmx); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_mmx); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_mmx); +extern prototype_subpixel_predict(vp8_sixtap_predict4x4_mmx); +extern prototype_subpixel_predict(vp8_bilinear_predict16x16_mmx); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx); +extern prototype_subpixel_predict(vp8_bilinear_predict8x4_mmx); +extern prototype_subpixel_predict(vp8_bilinear_predict4x4_mmx); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_mmx + +#undef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_mmx + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_mmx + +#undef vp8_subpix_sixtap4x4 +#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_mmx + +#undef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_mmx + +#undef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_mmx + +#undef vp8_subpix_bilinear8x4 +#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_mmx + +#undef vp8_subpix_bilinear4x4 +#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_sse2); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_sse2); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_sse2); +extern prototype_subpixel_predict(vp8_bilinear_predict16x16_sse2); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_sse2 + +#undef vp8_subpix_sixtap8x8 +#define 
vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_sse2 + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_sse2 + +#undef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_sse2 + +#undef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_sse2 + +#endif +#endif + +#endif diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c new file mode 100644 index 000000000..68454f709 --- /dev/null +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "vpx_ports/mem.h" +#include "subpixel.h" + +extern const short vp8_six_tap_mmx[8][6*8]; +extern const short vp8_bilinear_filters_mmx[8][2*8]; + +extern void vp8_filter_block1d_h6_mmx +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1dc_v6_mmx +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned 
int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_unpack_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern void vp8_unpack_block1d8_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern void vp8_pack_block1d8_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern void vp8_pack_block1d16_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx); + + +#if HAVE_MMX +void vp8_sixtap_predict4x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); // Temp data bufffer used in filtering + const short *HFilter, *VFilter; + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); + +} + + +void vp8_sixtap_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + 
int dst_pitch +) +{ + + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); // Temp data bufffer used in filtering + + const short *HFilter, *VFilter; + + + HFilter = vp8_six_tap_mmx[xoffset]; + + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); + +} + + +void vp8_sixtap_predict8x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); + +} + + +void vp8_sixtap_predict8x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + 
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{

    /* Intermediate 16-bit buffer shared by the two filter passes. */
    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    // Temp data buffer used in filtering

    const short *HFilter, *VFilter;

    /* Horizontal pass: two 4-pixel columns, 9 rows (4 output rows plus
     * vertical-tap context, presumably), into FData2 at pitch 16. */
    HFilter = vp8_six_tap_mmx[xoffset];
    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);

    /* Vertical pass: 4 output rows into dst. */
    VFilter = vp8_six_tap_mmx[yoffset];
    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);

}



/* 16x16 bilinear sub-pixel prediction, MMX.
 * Implemented as four independent 8x8 bilinear predictions covering the
 * four quadrants of the 16x16 block. */
void vp8_bilinear_predict16x16_mmx
(
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);                                       /* top-left */
    vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);                               /* top-right */
    vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);         /* bottom-left */
    vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); /* bottom-right */
}
#endif


#if HAVE_SSE2
/* 16x16 six-tap sub-pixel prediction, SSE2.
 * Unlike the MMX path, the full-pel cases (offset == 0) skip the filter
 * and use dedicated unpack/pack routines for that pass. */
void vp8_sixtap_predict16x16_sse2
(
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch

)
{
    /* Intermediate 16-bit buffer between the two passes. */
    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);    // Temp data buffer used in filtering

    const short *HFilter, *VFilter;

    if (xoffset)
    {
        /* Horizontal 6-tap pass: 21 rows into FData2 at pitch 32. */
        HFilter = vp8_six_tap_mmx[xoffset];
        vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
    }
    else
    {
        /* Full-pel horizontally: just widen source bytes into FData2. */
        vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
src_pixels_per_line, 21, 32);
    }

    if (yoffset)
    {
        /* Vertical 6-tap pass, done as two 8-wide halves into dst. */
        VFilter = vp8_six_tap_mmx[yoffset];
        vp8_filter_block1d8_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
        vp8_filter_block1d8_v6_sse2(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
    }
    else
    {
        /* Full-pel vertically: narrow the 16-bit rows back to bytes. */
        vp8_pack_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16);
    }
}


/* 8x8 six-tap sub-pixel prediction, SSE2.
 * Two-pass separable filter with full-pel short-circuits, as in the
 * 16x16 SSE2 variant. */
void vp8_sixtap_predict8x8_sse2
(
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    /* Intermediate 16-bit buffer between the two passes. */
    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    // Temp data buffer used in filtering
    const short *HFilter, *VFilter;

    if (xoffset)
    {
        /* Horizontal 6-tap pass: 13 rows into FData2 at pitch 16. */
        HFilter = vp8_six_tap_mmx[xoffset];
        vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
    }
    else
    {
        /* Full-pel horizontally: widen source bytes into FData2. */
        vp8_unpack_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 13, 16);
    }

    if (yoffset)
    {
        /* Vertical 6-tap pass into dst.
         * NOTE(review): the 7th argument here is dst_pitch where the MMX
         * path passes the literal output width (8) — verify this matches
         * the SSE2 assembly prototype's parameter order. */
        VFilter = vp8_six_tap_mmx[yoffset];
        vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
    }
    else
    {
        /* Full-pel vertically: narrow the 16-bit rows back to bytes. */
        vp8_pack_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, dst_pitch);
    }


}


/* 8x4 six-tap sub-pixel prediction, SSE2.
 * Same structure as the 8x8 variant with 9 intermediate rows and
 * 4 output rows. */
void vp8_sixtap_predict8x4_sse2
(
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    /* Intermediate 16-bit buffer between the two passes. */
    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    // Temp data buffer used in filtering
    const short *HFilter, *VFilter;

    if (xoffset)
    {
        /* Horizontal 6-tap pass: 9 rows into FData2 at pitch 16. */
        HFilter = vp8_six_tap_mmx[xoffset];
        vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
    }
    else
    {
        /* Full-pel horizontally: widen source bytes into FData2. */
        vp8_unpack_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 9, 16);
    }

    if (yoffset)
    {
        /* Vertical 6-tap pass: 4 output rows into dst. */
        VFilter = vp8_six_tap_mmx[yoffset];
        vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
    }
    else
    {
        /* Full-pel vertically: narrow the 16-bit rows back to bytes. */
        vp8_pack_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 4, dst_pitch);
    }


}
#endif
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
new file mode 100644
index 000000000..5312e06da
--- /dev/null
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -0,0 +1,117 @@
/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license and patent
 *  grant that can be found in the LICENSE file in the root of the source
 *  tree. All contributing project authors may be found in the AUTHORS
 *  file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "g_common.h"
#include "subpixel.h"
#include "loopfilter.h"
#include "recon.h"
#include "idct.h"
#include "pragmas.h"
#include "onyxc_int.h"

/* Populate the runtime CPU dispatch (RTCD) table in ctx with the fastest
 * implementations supported by the host x86 CPU, as reported by
 * x86_simd_caps(). No-op unless CONFIG_RUNTIME_CPU_DETECT is set. */
void vp8_arch_x86_common_init(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
    int flags = x86_simd_caps();
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;   /* NOTE(review): currently unused below */
    int wmt_enabled = flags & HAS_SSE2;

    /* Note:
     *
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in static mappings (<arch>/filename_<arch>.h)
     */

    /* Override default functions with fastest ones for this CPU.
     */
#if HAVE_MMX

    if (mmx_enabled)
    {
        /* Inverse transforms. */
        rtcd->idct.idct1        = vp8_short_idct4x4llm_1_mmx;
        rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;
        rtcd->idct.idct1_scalar = vp8_dc_only_idct_mmx;
        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;
        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_mmx;



        /* Reconstruction / block copies. */
        rtcd->recon.recon       = vp8_recon_b_mmx;
        rtcd->recon.copy8x8     = vp8_copy_mem8x8_mmx;
        rtcd->recon.copy8x4     = vp8_copy_mem8x4_mmx;
        rtcd->recon.copy16x16   = vp8_copy_mem16x16_mmx;

        /* Sub-pixel prediction (six-tap and bilinear). */
        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_mmx;
        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_mmx;
        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_mmx;
        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict4x4_mmx;
        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_mmx;
        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_mmx;
        rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_mmx;
        rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_mmx;

        /* Loop filters (normal and simple, macroblock and block edges). */
        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_mmx;
        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_mmx;
        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx;
        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_mmx;
        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx;
        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_mmx;
        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx;
        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_mmx;

#if CONFIG_POSTPROC
        /* Post-processing; no MMX version of the across-filter is
         * installed here (commented-out line kept for reference). */
        rtcd->postproc.down        = vp8_mbpost_proc_down_mmx;
        //rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
        rtcd->postproc.downacross  = vp8_post_proc_down_and_across_mmx;
        rtcd->postproc.addnoise    = vp8_plane_add_noise_mmx;
#endif
    }

#endif
#if HAVE_SSE2

    if (wmt_enabled)
    {
        /* SSE2 overrides; entries not reassigned here keep whatever was
         * installed above (MMX) or the C defaults. */
        rtcd->recon.recon2    = vp8_recon2b_sse2;
        rtcd->recon.recon4    = vp8_recon4b_sse2;
        rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;

        rtcd->idct.iwalsh16   = vp8_short_inv_walsh4x4_sse2;

        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_sse2;
        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_sse2;
        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_sse2;
        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_sse2;
        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_sse2;

        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_sse2;
        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_sse2;
        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2;
        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_sse2;
        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2;
        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_sse2;
        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2;
        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_sse2;

#if CONFIG_POSTPROC
        rtcd->postproc.down       = vp8_mbpost_proc_down_xmm;
        rtcd->postproc.across     = vp8_mbpost_proc_across_ip_xmm;
        rtcd->postproc.downacross = vp8_post_proc_down_and_across_xmm;
        rtcd->postproc.addnoise   = vp8_plane_add_noise_wmt;
#endif
    }

#endif
#endif
}