diff options
author | hkuang <hkuang@google.com> | 2014-01-06 14:34:09 -0800 |
---|---|---|
committer | hkuang <hkuang@google.com> | 2014-01-08 11:58:42 -0800 |
commit | 691111aacf58b050ee7f40c9a6a9ac69bfc612a2 (patch) | |
tree | 4f9a1abdd7ad8f36f174953498bef2a3eb758a62 | |
parent | fb53409d2a4030f0af48850f33aad617cc9c3004 (diff) | |
download | libvpx-691111aacf58b050ee7f40c9a6a9ac69bfc612a2.tar libvpx-691111aacf58b050ee7f40c9a6a9ac69bfc612a2.tar.gz libvpx-691111aacf58b050ee7f40c9a6a9ac69bfc612a2.tar.bz2 libvpx-691111aacf58b050ee7f40c9a6a9ac69bfc612a2.zip |
Add initial intra frame neon optimization. 1~2% gain.
More intra optimizations will be added.
Change-Id: I33ae8d93f6002bf7b64cc2669602d9e6bfa5a6e8
-rw-r--r-- | vp9/common/arm/neon/vp9_reconintra_neon.asm | 286 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 16 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 1 |
3 files changed, 295 insertions, 8 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm new file mode 100644 index 000000000..f106bc78e --- /dev/null +++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -0,0 +1,286 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_v_predictor_4x4_neon| + EXPORT |vp9_v_predictor_8x8_neon| + EXPORT |vp9_v_predictor_16x16_neon| + EXPORT |vp9_v_predictor_32x32_neon| + EXPORT |vp9_h_predictor_4x4_neon| + EXPORT |vp9_h_predictor_8x8_neon| + EXPORT |vp9_h_predictor_16x16_neon| + EXPORT |vp9_h_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_4x4_neon| + +;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_8x8_neon| + +;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_16x16_neon| + +;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |vp9_v_predictor_32x32_neon| + +;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_4x4_neon| + +;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_8x8_neon| + +;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_16x16_neon| + +;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |vp9_h_predictor_32x32_neon| + + END diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 727f5c437..cc571072f 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -42,7 +42,7 @@ prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d63_predictor_4x4 $ssse3_x86inc prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_4x4 $ssse3_x86inc dspr2 +specialize vp9_h_predictor_4x4 $ssse3_x86inc neon dspr2 prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_4x4 @@ -54,7 +54,7 @@ prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const u specialize vp9_d153_predictor_4x4 $ssse3_x86inc prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_4x4 $sse_x86inc +specialize vp9_v_predictor_4x4 $sse_x86inc neon prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2 @@ -81,7 +81,7 @@ prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d63_predictor_8x8 $ssse3_x86inc prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_8x8 $ssse3_x86inc dspr2 +specialize vp9_h_predictor_8x8 $ssse3_x86inc neon dspr2 prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_8x8 @@ -93,7 +93,7 @@ prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const u specialize vp9_d153_predictor_8x8 $ssse3_x86inc prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_8x8 $sse_x86inc +specialize vp9_v_predictor_8x8 $sse_x86inc neon prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2 @@ -120,7 +120,7 @@ prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d63_predictor_16x16 $ssse3_x86inc prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_16x16 $ssse3_x86inc dspr2 +specialize vp9_h_predictor_16x16 $ssse3_x86inc neon dspr2 prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_16x16 @@ -132,7 +132,7 @@ prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d153_predictor_16x16 $ssse3_x86inc prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_16x16 $sse2_x86inc +specialize vp9_v_predictor_16x16 $sse2_x86inc neon prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_tm_predictor_16x16 $sse2_x86inc @@ -159,7 +159,7 @@ prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d63_predictor_32x32 $ssse3_x86inc prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_32x32 $ssse3_x86inc +specialize vp9_h_predictor_32x32 $ssse3_x86inc neon prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_32x32 @@ -171,7 +171,7 @@ prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d153_predictor_32x32 prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_32x32 $sse2_x86inc +specialize vp9_v_predictor_32x32 $sse2_x86inc neon prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_tm_predictor_32x32 $sse2_x86_64 diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 01c55a4ad..51493b2ca 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -138,5 +138,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) |