Diffstat (limited to 'vp9/common/arm/neon')
-rw-r--r-- | vp9/common/arm/neon/vp9_avg_neon.asm           | 116
-rw-r--r-- | vp9/common/arm/neon/vp9_convolve8_avg_neon.asm |  24
-rw-r--r-- | vp9/common/arm/neon/vp9_convolve8_neon.asm     |  16
-rw-r--r-- | vp9/common/arm/neon/vp9_convolve_neon.c        |   5
-rw-r--r-- | vp9/common/arm/neon/vp9_copy_neon.asm          |  84
5 files changed, 223 insertions, 22 deletions
diff --git a/vp9/common/arm/neon/vp9_avg_neon.asm b/vp9/common/arm/neon/vp9_avg_neon.asm
new file mode 100644
index 000000000..7d2453021
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_avg_neon.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_convolve_avg_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_convolve_avg_neon| PROC
+    push                {r4-r6, lr}
+    ldrd                r4, r5, [sp, #32]
+    mov                 r6, r2
+
+    cmp                 r4, #32
+    bgt                 avg64
+    beq                 avg32
+    cmp                 r4, #8
+    bgt                 avg16
+    beq                 avg8
+    b                   avg4
+
+avg64
+    sub                 lr, r1, #32
+    sub                 r4, r3, #32
+avg64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    pld                 [r2, r3]
+    vld1.8              {q8-q9},   [r6@128]!
+    vld1.8              {q10-q11}, [r6@128], r4
+    vrhadd.u8           q0, q0, q8
+    vrhadd.u8           q1, q1, q9
+    vrhadd.u8           q2, q2, q10
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r4
+    subs                r5, r5, #1
+    bgt                 avg64_h
+    pop                 {r4-r6, pc}
+
+avg32
+    vld1.8              {q0-q1},   [r0], r1
+    vld1.8              {q2-q3},   [r0], r1
+    vld1.8              {q8-q9},   [r6@128], r3
+    vld1.8              {q10-q11}, [r6@128], r3
+    pld                 [r0]
+    vrhadd.u8           q0, q0, q8
+    pld                 [r0, r1]
+    vrhadd.u8           q1, q1, q9
+    pld                 [r6]
+    vrhadd.u8           q2, q2, q10
+    pld                 [r6, r3]
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg32
+    pop                 {r4-r6, pc}
+
+avg16
+    vld1.8              {q0}, [r0], r1
+    vld1.8              {q1}, [r0], r1
+    vld1.8              {q2}, [r6@128], r3
+    vld1.8              {q3}, [r6@128], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q2
+    pld                 [r6]
+    pld                 [r6, r3]
+    vrhadd.u8           q1, q1, q3
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg16
+    pop                 {r4-r6, pc}
+
+avg8
+    vld1.8              {d0}, [r0], r1
+    vld1.8              {d1}, [r0], r1
+    vld1.8              {d2}, [r6@64], r3
+    vld1.8              {d3}, [r6@64], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q1
+    pld                 [r6]
+    pld                 [r6, r3]
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d1}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 avg8
+    pop                 {r4-r6, pc}
+
+avg4
+    vld1.32             {d0[0]}, [r0], r1
+    vld1.32             {d0[1]}, [r0], r1
+    vld1.32             {d2[0]}, [r6@32], r3
+    vld1.32             {d2[1]}, [r6@32], r3
+    vrhadd.u8           d0, d0, d2
+    vst1.32             {d0[0]}, [r2@32], r3
+    vst1.32             {d0[1]}, [r2@32], r3
+    subs                r5, r5, #2
+    bgt                 avg4
+    pop                 {r4-r6, pc}
+    ENDP
+
+    END
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
index 437295245..6b20cb9bf 100644
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -159,10 +159,10 @@ loop_horiz
     ; average the new value and the dst value
     vrhadd.u8       q1, q1, q3
 
-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r4
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
 
     vmov            q8,  q9
     vmov            d20, d23
@@ -234,10 +234,10 @@ loop_vert
     vmovl.u8        q12, d24
     vmovl.u8        q13, d26
 
-    vld1.u32        {d6[0]}, [r5], r3
-    vld1.u32        {d6[1]}, [r8], r3
-    vld1.u32        {d7[0]}, [r5], r3
-    vld1.u32        {d7[1]}, [r8], r3
+    vld1.u32        {d6[0]}, [r5@32], r3
+    vld1.u32        {d6[1]}, [r8@32], r3
+    vld1.u32        {d7[0]}, [r5@32], r3
+    vld1.u32        {d7[1]}, [r8@32], r3
 
     pld             [r7]
     pld             [r4]
@@ -276,10 +276,10 @@ loop_vert
     sub             r5, r5, r3, lsl #1      ; reset for store
     sub             r8, r8, r3, lsl #1
 
-    vst1.u32        {d2[0]}, [r5], r3
-    vst1.u32        {d2[1]}, [r8], r3
-    vst1.u32        {d3[0]}, [r5], r3
-    vst1.u32        {d3[1]}, [r8], r3
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
 
     vmov            q8,  q10
     vmov            d18, d22
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.asm b/vp9/common/arm/neon/vp9_convolve8_neon.asm
index 667cddb36..45258454c 100644
--- a/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -148,10 +148,10 @@ loop_horiz
     vtrn.32         d2, d3
     vtrn.8          d2, d3
 
-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r4
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
 
     vmov            q8,  q9
     vmov            d20, d23
@@ -254,10 +254,10 @@ loop_vert
     vqmovn.u16      d2, q1
     vqmovn.u16      d3, q2
 
-    vst1.u32        {d2[0]}, [r5], r3
-    vst1.u32        {d2[1]}, [r8], r3
-    vst1.u32        {d3[0]}, [r5], r3
-    vst1.u32        {d3[1]}, [r8], r3
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
 
     vmov            q8,  q10
     vmov            d18, d22
diff --git a/vp9/common/arm/neon/vp9_convolve_neon.c b/vp9/common/arm/neon/vp9_convolve_neon.c
index 6e37ff6f9..d8b24bfaf 100644
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -10,6 +10,7 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
+#include "vpx_ports/mem.h"
 
 void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
@@ -19,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
   /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
    * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
    */
-  uint8_t temp[64 * 72];
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
 
   // Account for the vertical phase needing 3 lines prior and 4 lines post
   int intermediate_height = h + 7;
@@ -53,7 +54,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
-  uint8_t temp[64 * 72];
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
   int intermediate_height = h + 7;
 
   if (x_step_q4 != 16 || y_step_q4 != 16)
diff --git a/vp9/common/arm/neon/vp9_copy_neon.asm b/vp9/common/arm/neon/vp9_copy_neon.asm
new file mode 100644
index 000000000..a0bd04a35
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_copy_neon.asm
@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_convolve_copy_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_convolve_copy_neon| PROC
+    push                {r4-r5, lr}
+    ldrd                r4, r5, [sp, #28]
+
+    cmp                 r4, #32
+    bgt                 copy64
+    beq                 copy32
+    cmp                 r4, #8
+    bgt                 copy16
+    beq                 copy8
+    b                   copy4
+
+copy64
+    sub                 lr, r1, #32
+    sub                 r3, r3, #32
+copy64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #1
+    bgt                 copy64_h
+    pop                 {r4-r5, pc}
+
+copy32
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q2-q3}, [r0], r1
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy32
+    pop                 {r4-r5, pc}
+
+copy16
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q1}, [r0], r1
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy16
+    pop                 {r4-r5, pc}
+
+copy8
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d2}, [r0], r1
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d2}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 copy8
+    pop                 {r4-r5, pc}
+
+copy4
+    ldr                 r12, [r0], r1
+    str                 r12, [r2], r3
+    subs                r5, r5, #1
+    bgt                 copy4
+    pop                 {r4-r5, pc}
+    ENDP
+
+    END
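
For readers unfamiliar with the instruction at the heart of the new averaging kernel: vrhadd.u8 is a rounded halving add, so each output byte is (a + b + 1) >> 1. The scalar sketch below illustrates the per-pixel behaviour the new vp9_convolve_avg_neon routine implements for a w x h block; the function name convolve_avg_ref and its exact signature are illustrative assumptions, not part of this change or the library's reference code.

#include <stdint.h>
#include <stddef.h>

/* Illustrative scalar sketch only: averages the block in src into dst
 * using the same rounding as vrhadd.u8, i.e. (a + b + 1) >> 1 per byte. */
static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);  /* rounded average */
    src += src_stride;
    dst += dst_stride;
  }
}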