From 56a8a0b54bc7c5c85ba734672dbeee2eae7cdf10 Mon Sep 17 00:00:00 2001 From: Frank Galligan Date: Fri, 24 Jan 2014 11:07:14 -0800 Subject: Optimize vp9_tm_predictor_8x8_neon function Change-Id: Ia12aae491202098ff66366145aa0c3da38dc97e5 --- vp9/common/arm/neon/vp9_reconintra_neon.asm | 63 ++++++++++++++--------------- 1 file changed, 31 insertions(+), 32 deletions(-) (limited to 'vp9') diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm index 71bf24c9f..c90c26b36 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -353,53 +353,52 @@ loop_h vsubl.u8 q3, d2, d0 ; Load left row by row and compute left + (above - ytop_left) + vld1.u8 {d6}, [r3] + ; 1st row and 2nd row - ldrb r12, [r3], #1 - ldrb r2, [r3], #1 - vdup.u16 q1, r12 - vdup.u16 q2, r2 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqshrun.s16 d0, q1, #0 - vqshrun.s16 d1, q2, #0 - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 + vdup.u8 d0, d6[0] + vdup.u8 d1, d6[1] + vaddw.s16 q1, q3, d0 + vaddw.s16 q2, q3, d1 ; 3rd row and 4th row - ldrb r12, [r3], #1 - ldrb r2, [r3], #1 - vdup.u16 q1, r12 - vdup.u16 q2, r2 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 + vdup.u8 d0, d6[2] + vdup.u8 d1, d6[3] + vaddw.s16 q8, q3, d0 + vaddw.s16 q9, q3, d1 + vqshrun.s16 d0, q1, #0 vqshrun.s16 d1, q2, #0 + vqshrun.s16 d2, q8, #0 + vqshrun.s16 d3, q9, #0 + vst1.64 {d0}, [r0], r1 vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 ; 5th row and 6th row - ldrb r12, [r3], #1 - ldrb r2, [r3], #1 - vdup.u16 q1, r12 - vdup.u16 q2, r2 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqshrun.s16 d0, q1, #0 - vqshrun.s16 d1, q2, #0 - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 + vdup.u8 d0, d6[4] + vdup.u8 d1, d6[5] + vaddw.s16 q1, q3, d0 + vaddw.s16 q2, q3, d1 ; 7rd row and 8th row - ldrb r12, [r3], #1 - ldrb r2, [r3], #1 - vdup.u16 q1, r12 - vdup.u16 q2, r2 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 + vdup.u8 d0, d6[6] + vdup.u8 d1, d6[7] + vaddw.s16 q8, q3, d0 + vaddw.s16 q9, q3, d1 + vqshrun.s16 d0, q1, #0 vqshrun.s16 d1, q2, #0 + vqshrun.s16 d2, q8, #0 + vqshrun.s16 d3, q9, #0 + vst1.64 {d0}, [r0], r1 vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + bx lr ENDP ; |vp9_tm_predictor_8x8_neon| -- cgit v1.2.3