author | Scott LaVarnway <slavarnway@google.com> | 2014-09-03 11:26:27 -0700
committer | Johann <johannkoenig@google.com> | 2014-09-03 13:41:27 -0700
commit | dcbfacbb984cdcd5b5a5030103305fa3669dfece (patch)
tree | d0e5c4a405d7174d87ef3bd3f8be6ba8e9564a4a
parent | 9293d267d22def752366c9512be98f2d51fd2c15 (diff)
download | libvpx-dcbfacbb984cdcd5b5a5030103305fa3669dfece.tar libvpx-dcbfacbb984cdcd5b5a5030103305fa3669dfece.tar.gz libvpx-dcbfacbb984cdcd5b5a5030103305fa3669dfece.tar.bz2 libvpx-dcbfacbb984cdcd5b5a5030103305fa3669dfece.zip
Neon version of vp8_build_intra_predictors_mby_s() and
vp8_build_intra_predictors_mbuv_s().

This patch replaces the assembly version with an intrinsic version.

On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a
performance improvement of ~2.6%.

Change-Id: I9ef65bad929450c0215253fdae1c16c8b4a8f26f
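For reference, the core of the new DC_PRED path sums the 16 pixels of the row above the macroblock with widening pairwise adds instead of the scalar accumulation the assembly used. The sketch below isolates that reduction outside of MACROBLOCKD; the helper name sum_above_row_neon and the main() harness are illustrative additions, not part of the patch.

```c
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Widening pairwise-add reduction of the 16 "above" pixels, mirroring the
 * DC_PRED case in the new reconintra_neon.c: u8 -> u16 -> u32 -> u64,
 * then fold the two 64-bit halves together. */
static unsigned int sum_above_row_neon(const uint8_t *yabove_row) {
  const uint8x16_t v_above = vld1q_u8(yabove_row);
  const uint16x8_t a = vpaddlq_u8(v_above);
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                vreinterpret_u32_u64(vget_high_u64(c)));
  return vget_lane_u32(d, 0);
}

int main(void) {
  uint8_t row[16];
  int i;
  for (i = 0; i < 16; ++i) row[i] = (uint8_t)(i + 1);  /* 1..16, sum = 136 */
  printf("above-row sum = %u\n", sum_above_row_neon(row));
  return 0;
}
```

The left column is still accumulated with a scalar loop in the intrinsic version, since those pixels are strided by left_stride rather than contiguous.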
-rw-r--r-- | test/intrapred_test.cc | 10
-rw-r--r-- | vp8/common/arm/neon/buildintrapredictorsmby_neon.asm | 595
-rw-r--r-- | vp8/common/arm/neon/reconintra_neon.c | 210
-rw-r--r-- | vp8/common/arm/reconintra_arm.c | 58
-rw-r--r-- | vp8/common/rtcd_defs.pl | 5
-rw-r--r-- | vp8/vp8_common.mk | 3
6 files changed, 223 insertions, 658 deletions
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index ead476030..f0d9c3414 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc @@ -294,6 +294,11 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest, ::testing::Values( vp8_build_intra_predictors_mby_s_ssse3)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest, + ::testing::Values( + vp8_build_intra_predictors_mby_s_neon)); +#endif typedef void (*IntraPredUvFunc)(MACROBLOCKD *x, uint8_t *uabove_row, @@ -382,5 +387,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest, ::testing::Values( vp8_build_intra_predictors_mbuv_s_ssse3)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest, + ::testing::Values( + vp8_build_intra_predictors_mbuv_s_neon)); +#endif } // namespace diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm deleted file mode 100644 index a8730aa04..000000000 --- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm +++ /dev/null @@ -1,595 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_build_intra_predictors_mby_neon_func| - EXPORT |vp8_build_intra_predictors_mby_s_neon_func| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_neon_func| PROC - push {r4-r8, lr} - vpush {d8-d15} - - cmp r3, #0 - beq case_dc_pred - cmp r3, #1 - beq case_v_pred - cmp r3, #2 - beq case_h_pred - cmp r3, #3 - beq case_tm_pred - -case_dc_pred - ldr r4, [sp, #88] ; Up - ldr r5, [sp, #92] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up - - cmp r5, #0 - beq skip_dc_pred_left - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! 
- vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - - vpop {d8-d15} - pop {r4-r8,pc} -case_v_pred - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vpop {d8-d15} - pop {r4-r8,pc} - -case_h_pred - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - vpop {d8-d15} - pop {r4-r8,pc} - -case_tm_pred - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! 
- - subs r12, r12, #1 - bne case_tm_pred_loop - - vpop {d8-d15} - pop {r4-r8,pc} - - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_s_neon_func| PROC - push {r4-r8, lr} - vpush {d8-d15} - - mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; - - cmp r3, #0 - beq case_dc_pred_s - cmp r3, #1 - beq case_v_pred_s - cmp r3, #2 - beq case_h_pred_s - cmp r3, #3 - beq case_tm_pred_s - -case_dc_pred_s - ldr r4, [sp, #88] ; Up - ldr r5, [sp, #92] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left_s - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up_s - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up_s - - cmp r5, #0 - beq skip_dc_pred_left_s - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left_s - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left_s - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - vpop {d8-d15} - pop {r4-r8,pc} -case_v_pred_s - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - vpop {d8-d15} - pop {r4-r8,pc} - -case_h_pred_s - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - - ldrb r3, [r0], r2 - ldrb 
r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - vpop {d8-d15} - pop {r4-r8,pc} - -case_tm_pred_s - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop_s - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - subs r12, r12, #1 - bne case_tm_pred_loop_s - - vpop {d8-d15} - pop {r4-r8,pc} - - ENDP - - - END diff --git a/vp8/common/arm/neon/reconintra_neon.c b/vp8/common/arm/neon/reconintra_neon.c new file mode 100644 index 000000000..af52cd5ea --- /dev/null +++ b/vp8/common/arm/neon/reconintra_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { + const int mode = x->mode_info_context->mbmi.mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x16_t v_expected_dc = vdupq_n_u8(128); + + if (shift) { + unsigned int average = 0; + int expected_dc; + if (x->up_available) { + const uint8x16_t v_above = vld1q_u8(yabove_row); + const uint16x8_t a = vpaddlq_u8(v_above); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + average = vget_lane_u32(d, 0); + } + if (x->left_available) { + for (i = 0; i < 16; ++i) { + average += yleft[0]; + yleft += left_stride; + } + } + shift += 3; + expected_dc = (average + (1 << (shift - 1))) >> shift; + v_expected_dc = vmovq_n_u8((uint8_t)expected_dc); + } + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_expected_dc); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + { + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_above); + ypred_ptr += y_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 16; ++i) { + const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]); + yleft += left_stride; + vst1q_u8(ypred_ptr, v_yleft); + ypred_ptr += y_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]); + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]); + const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft); + const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft); + const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo), + vreinterpretq_s16_u16(v_ytop_left)); + const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi), + vreinterpretq_s16_u16(v_ytop_left)); + const uint8x8_t pred_lo = vqmovun_s16(b_lo); + const uint8x8_t pred_hi = vqmovun_s16(b_hi); + + vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi)); + ypred_ptr += y_stride; + yleft += left_stride; + } + } + break; + } +} + +void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { + const int mode = x->mode_info_context->mbmi.uv_mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x8_t v_expected_udc = vdup_n_u8(128); + uint8x8_t v_expected_vdc = vdup_n_u8(128); + + if (shift) { + unsigned int average_u = 0; + unsigned int average_v = 0; + int expected_udc; + int expected_vdc; + if (x->up_available) { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove)); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0); + average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2); + } + if (x->left_available) { + for (i = 0; i < 8; ++i) { + average_u += uleft[0]; + uleft += left_stride; + average_v += vleft[0]; + vleft += 
left_stride; + } + } + shift += 2; + expected_udc = (average_u + (1 << (shift - 1))) >> shift; + expected_vdc = (average_v + (1 << (shift - 1))) >> shift; + v_expected_udc = vmov_n_u8((uint8_t)expected_udc); + v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc); + } + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_expected_udc); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_expected_vdc); + vpred_ptr += pred_stride; + } + } + break; + case V_PRED: + { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_uabove); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vabove); + vpred_ptr += pred_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]); + uleft += left_stride; + vleft += left_stride; + vst1_u8(upred_ptr, v_uleft); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vleft); + vpred_ptr += pred_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]); + const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]); + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]); + const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft); + const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft); + const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u), + vreinterpretq_s16_u16(v_utop_left)); + const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v), + vreinterpretq_s16_u16(v_vtop_left)); + const uint8x8_t pred_u = vqmovun_s16(b_u); + const uint8x8_t pred_v = vqmovun_s16(b_v); + + vst1_u8(upred_ptr, pred_u); + vst1_u8(vpred_ptr, pred_v); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + uleft += left_stride; + vleft += left_stride; + } + } + break; + } +} diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c deleted file mode 100644 index e55a33cbb..000000000 --- a/vp8/common/arm/reconintra_arm.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vp8/common/blockd.h" -#include "vpx_mem/vpx_mem.h" - -#if HAVE_NEON_ASM -extern void vp8_build_intra_predictors_mby_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x) -{ - unsigned char *y_buffer = x->dst.y_buffer; - unsigned char *ypred_ptr = x->predictor; - int y_stride = x->dst.y_stride; - int mode = x->mode_info_context->mbmi.mode; - int Up = x->up_available; - int Left = x->left_available; - - vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); -} - -extern void vp8_build_intra_predictors_mby_s_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x) -{ - unsigned char *y_buffer = x->dst.y_buffer; - unsigned char *ypred_ptr = x->predictor; - int y_stride = x->dst.y_stride; - int mode = x->mode_info_context->mbmi.mode; - int Up = x->up_available; - int Left = x->left_available; - - vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); -} - -#endif diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 97914ccf2..ea260b961 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -149,11 +149,10 @@ $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6; $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"; -specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/; -#TODO: fix assembly for neon +specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/; add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"; -specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/; +specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/; add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"; specialize qw/vp8_intra4x4_predict media/; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 331a1ca71..50ef633e1 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -156,9 +156,7 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) # common (neon) -#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) -#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c # common (neon intrinsics) @@ -172,6 +170,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c 
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
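The TM_PRED cases in the new reconintra_neon.c above follow the same pattern for the luma and chroma predictors: widen the above row with vaddl_u8, subtract the top-left pixel in 16-bit arithmetic, and saturate back to bytes with vqmovun_s16, replacing the assembly's vqadd/vqshrun sequence. Below is a minimal standalone sketch of one 8-pixel row; the tm_predict_row helper and the test values in main() are illustrative, not part of the patch.

```c
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* clamp(above[i] + left - top_left) for one 8-pixel row, using the same
 * widen / subtract / saturating-narrow sequence as the TM_PRED cases in
 * reconintra_neon.c. */
static void tm_predict_row(uint8_t *dst, const uint8_t *above,
                           uint8_t left, uint8_t top_left) {
  const uint16x8_t v_top_left = vmovq_n_u16((uint16_t)top_left);
  const uint8x8_t v_above = vld1_u8(above);
  const uint8x8_t v_left = vmov_n_u8(left);
  const uint16x8_t sum = vaddl_u8(v_above, v_left);           /* above + left */
  const int16x8_t diff = vsubq_s16(vreinterpretq_s16_u16(sum),
                                   vreinterpretq_s16_u16(v_top_left));
  vst1_u8(dst, vqmovun_s16(diff));                            /* clamp to 0..255 */
}

int main(void) {
  const uint8_t above[8] = {10, 20, 30, 250, 0, 128, 200, 60};
  uint8_t dst[8];
  int i;
  tm_predict_row(dst, above, 100, 90);  /* left = 100, top_left = 90 */
  for (i = 0; i < 8; ++i) printf("%d ", dst[i]);
  printf("\n");  /* expected: 20 30 40 255 10 138 210 70 */
  return 0;
}
```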