summaryrefslogtreecommitdiff
path: root/vp8
diff options
context:
space:
mode:
author Johann <johannkoenig@google.com> 2010-09-20 10:47:22 -0700
committer Code Review <code-review@webmproject.org> 2010-09-20 10:47:22 -0700
commit 9c9afbab856a7d58590f467821013cfb8f88d1e8 (patch)
tree 04f8e4b36085ae3c03ee28bc0a78730af0fe7e52 /vp8
parent 9100073e8d99f2cf1b0b2d2288687d193295addf (diff)
parent 14ba764219296ec74fab5647ca7bdc2e4ca693ce (diff)
download libvpx-9c9afbab856a7d58590f467821013cfb8f88d1e8.tar
libvpx-9c9afbab856a7d58590f467821013cfb8f88d1e8.tar.gz
libvpx-9c9afbab856a7d58590f467821013cfb8f88d1e8.tar.bz2
libvpx-9c9afbab856a7d58590f467821013cfb8f88d1e8.zip
Merge "Update NEON wide idcts"
Diffstat (limited to 'vp8')
-rw-r--r-- vp8/decoder/arm/neon/dequant_dc_idct_neon.asm 136
-rw-r--r-- vp8/decoder/arm/neon/idct_blk_neon.c 142
-rw-r--r-- vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm 79
-rw-r--r-- vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm 69
-rw-r--r-- vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm 190
-rw-r--r-- vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm 183
-rw-r--r-- vp8/vp8dx_arm.mk 5
7 files changed, 579 insertions, 225 deletions
diff --git a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
deleted file mode 100644
index f68a78095..000000000
--- a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_dc_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride,
-; int Dc);
-; r0 short *input,
-; r1 short *dq,
-; r2 unsigned char *pred
-; r3 unsigned char *dest
-; sp int pitch
-; sp+4 int stride
-; sp+8 int Dc
-|vp8_dequant_dc_idct_add_neon| PROC
- vld1.16 {q3, q4}, [r0]
- vld1.16 {q5, q6}, [r1]
-
- ldr r1, [sp, #8] ;load Dc from stack
-
- ldr r12, _CONSTANTS_
-
- vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
- vmul.i16 q2, q4, q6
-
- vmov.16 d2[0], r1
-
- ldr r1, [sp] ; pitch
- vld1.32 {d14[0]}, [r2], r1
- vld1.32 {d14[1]}, [r2], r1
- vld1.32 {d15[0]}, [r2], r1
- vld1.32 {d15[1]}, [r2]
-
- ldr r1, [sp, #4] ; stride
-
-;|short_idct4x4llm_neon| PROC
- vld1.16 {d0}, [r12]
- vswp d3, d4 ;q2(vp[4] vp[12])
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
-; memset(input, 0, 32) -- 32bytes
- vmov.i16 q14, #0
-
- vswp d3, d4
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vmov q15, q14
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vst1.16 {q14, q15}, [r0]
-
- vrshr.s16 d2, d2, #3
- vrshr.s16 d3, d3, #3
- vrshr.s16 d4, d4, #3
- vrshr.s16 d5, d5, #3
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vaddw.u8 q1, q1, d14
- vaddw.u8 q2, q2, d15
-
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
-
- vst1.32 {d0[0]}, [r3], r1
- vst1.32 {d0[1]}, [r3], r1
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r3]
-
- bx lr
-
- ENDP ; |vp8_dequant_dc_idct_add_neon|
-
-; Constant Pool
-_CONSTANTS_ DCD cospi8sqrt2minus1
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2 DCD 0x8a8c8a8c
-
- END
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
index 4725e6240..103cce74e 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -12,6 +12,21 @@
#include "idct.h"
#include "dequantize.h"
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+ (short *input, short *dq, unsigned char *pre, unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+ (short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+ (short *q, short *dq, unsigned char *pre, unsigned char *dst,
+ int pitch, int stride);
+void idct_dequant_0_2x_neon
+ (short *q, short dq, unsigned char *pre, int pitch,
+ unsigned char *dst, int stride);
+
void vp8_dequant_dc_idct_add_y_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc)
@@ -20,25 +35,15 @@ void vp8_dequant_dc_idct_add_y_block_neon
for (i = 0; i < 4; i++)
{
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride);
-
- if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
- else
- vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride);
-
- if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
else
- vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride);
+ idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
- if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
else
- vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride);
+ idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
q += 64;
dc += 4;
@@ -56,37 +61,15 @@ void vp8_dequant_idct_add_y_block_neon
for (i = 0; i < 4; i++)
{
- if (eobs[0] > 1)
- vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride);
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
else
- {
- vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride);
- ((int *)q)[0] = 0;
- }
+ idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
- if (eobs[1] > 1)
- vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride);
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
else
- {
- vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride);
- ((int *)(q+16))[0] = 0;
- }
-
- if (eobs[2] > 1)
- vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride);
- ((int *)(q+32))[0] = 0;
- }
-
- if (eobs[3] > 1)
- vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride);
- ((int *)(q+48))[0] = 0;
- }
+ idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
q += 64;
pre += 64;
@@ -101,51 +84,34 @@ void vp8_dequant_idct_add_uv_block_neon
{
int i;
- for (i = 0; i < 2; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride);
- ((int *)q)[0] = 0;
- }
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
- if (eobs[1] > 1)
- vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride);
- ((int *)(q+16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstu += 4*stride;
- eobs += 2;
- }
+ q += 32;
+ pre += 32;
+ dstu += 4*stride;
- for (i = 0; i < 2; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride);
- ((int *)q)[0] = 0;
- }
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
- if (eobs[1] > 1)
- vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride);
- ((int *)(q+16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstv += 4*stride;
- eobs += 2;
- }
+ q += 32;
+ pre += 32;
+
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
+
+ q += 32;
+ pre += 32;
+ dstv += 4*stride;
+
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
}
diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
new file mode 100644
index 000000000..456f8e1d4
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -0,0 +1,79 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+; int pitch, unsigned char *dst, int stride);
+; r0 *q
+; r1 dq
+; r2 *pre
+; r3 pitch
+; sp *dst
+; sp+4 stride
+|idct_dequant_0_2x_neon| PROC ; DC-only path for two adjacent 4x4 blocks: adds (q[0]*dq + 4) >> 3 to each predictor pixel and stores the saturated result
+ add r12, r2, #4 ; r12 -> hi block predictor (pre + 4)
+ vld1.32 {d2[0]}, [r2], r3 ; lo pred: 4 bytes per row, rows 0-1 -> d2, rows 2-3 -> d4, stepping by pitch
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d4[1]}, [r2]
+ vld1.32 {d8[0]}, [r12], r3 ; hi pred: rows 0-1 -> d8, rows 2-3 -> d10. NOTE(review): d8-d11 are overwritten in this routine without save/restore; AAPCS lists d8-d15 as callee-saved - confirm callers tolerate this
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d10[1]}, [r12]
+
+ ldrh r12, [r0] ; lo q
+ ldrh r2, [r0, #32] ; hi q
+ mov r3, #0 ; pitch is no longer needed, reuse r3 as a zero source
+ strh r3, [r0] ; clear lo q[0] now that it has been read
+ strh r3, [r0, #32] ; clear hi q[0] (byte offset 32 = 16 shorts)
+
+ sxth r12, r12 ; lo
+ mul r0, r12, r1 ; q[0] * dq (r0 no longer needed as a pointer)
+ add r0, r0, #4 ; +4 then >>3: rounded divide by 8
+ asr r0, r0, #3
+ vdup.16 q0, r0 ; broadcast the lo value to all eight 16-bit lanes
+ sxth r2, r2 ; hi
+ mul r0, r2, r1 ; same computation for the hi block
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0 ; broadcast the hi value
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4 ; value + zero-extended predictor bytes, 16-bit lanes
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ ldr r2, [sp] ; dst
+ ldr r3, [sp, #4] ; stride
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2 ; saturate signed 16-bit sums to unsigned 8-bit
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ add r0, r2, #4 ; r0 -> hi block destination (dst + 4)
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d10[1]}, [r0]
+
+ bx lr
+
+ ENDP ; |idct_dequant_0_2x_neon|
+ END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
new file mode 100644
index 000000000..0dc036acb
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -0,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_dc_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+; unsigned char *dst, int stride);
+; r0 *dc
+; r1 *pre
+; r2 *dst
+; r3 stride
+|idct_dequant_dc_0_2x_neon| PROC ; DC-only path for two adjacent 4x4 blocks with the DC values read from *dc: adds (dc + 4) >> 3 to each predictor pixel
+ ldr r0, [r0] ; *dc: one 32-bit load fetches both 16-bit DC values
+ mov r12, #16 ; predictor row step is fixed at 16 bytes in this routine
+
+ vld1.32 {d2[0]}, [r1], r12 ; lo
+ vld1.32 {d2[1]}, [r1], r12
+ vld1.32 {d4[0]}, [r1], r12
+ vld1.32 {d4[1]}, [r1]
+ sub r1, r1, #44 ; three 16-byte steps put r1 at pre + 48; back up to pre + 4 for the hi block
+ vld1.32 {d8[0]}, [r1], r12 ; hi. NOTE(review): d8-d11 are overwritten in this routine without save/restore; AAPCS lists d8-d15 as callee-saved - confirm callers tolerate this
+ vld1.32 {d8[1]}, [r1], r12
+ vld1.32 {d10[0]}, [r1], r12
+ vld1.32 {d10[1]}, [r1]
+
+ sxth r1, r0 ; lo *dc
+ add r1, r1, #4 ; +4 then >>3: rounded divide by 8
+ asr r1, r1, #3
+ vdup.16 q0, r1 ; broadcast the lo value to all eight 16-bit lanes
+ sxth r0, r0, ror #16 ; hi *dc
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0 ; broadcast the hi value
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4 ; value + zero-extended predictor bytes, 16-bit lanes
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2 ; saturate signed 16-bit sums to unsigned 8-bit
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ add r0, r2, #4 ; r0 -> hi block destination (dst + 4)
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d10[1]}, [r0]
+
+ bx lr
+
+ ENDP ;|idct_dequant_dc_0_2x_neon|
+ END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
new file mode 100644
index 000000000..babfb91ad
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -0,0 +1,190 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_dc_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+; unsigned char *dst, int stride, short *dc);
+; r0 *q,
+; r1 *dq,
+; r2 *pre
+; r3 *dst
+; sp stride
+; sp+4 *dc
+|idct_dequant_dc_full_2x_neon| PROC ; dequantize + 4x4 inverse transform + predictor add for two adjacent blocks, DC terms taken from *dc. NOTE(review): q4-q7 (d8-d15) are overwritten without save/restore; AAPCS lists them as callee-saved - confirm callers tolerate this
+ vld1.16 {q3, q4}, [r0] ; lo input
+ vld1.16 {q5, q6}, [r1] ; use the same dq for both
+ mov r1, #16 ; pitch
+ add r0, r0, #32 ; r0 -> hi input (q + 16 shorts)
+ vld1.16 {q10, q11}, [r0] ; hi input
+ add r12, r2, #4 ; r12 -> hi pred (pre + 4)
+ vld1.32 {d14[0]}, [r2], r1 ; lo pred
+ vld1.32 {d14[1]}, [r2], r1
+ vld1.32 {d15[0]}, [r2], r1
+ vld1.32 {d15[1]}, [r2]
+ vld1.32 {d28[0]}, [r12], r1 ; hi pred
+ vld1.32 {d28[1]}, [r12], r1
+ vld1.32 {d29[0]}, [r12], r1
+ ldr r1, [sp, #4] ; dc
+ vld1.32 {d29[1]}, [r12]
+
+ ldr r2, _CONSTANTS_ ; r2 = address of the cospi8sqrt2minus1 / sinpi8sqrt2 words in the constant pool
+
+ ldrh r12, [r1], #2 ; lo *dc
+ ldrh r1, [r1] ; hi *dc
+
+ vmul.i16 q1, q3, q5 ; lo input * dq
+ vmul.i16 q2, q4, q6
+ vmul.i16 q8, q10, q5 ; hi input * dq
+ vmul.i16 q9, q11, q6
+
+ vmov.16 d2[0], r12 ; move lo dc up to neon, overwrite first element
+ vmov.16 d16[0], r1 ; move hi dc up to neon, overwrite first element
+
+ ldr r1, [sp] ; stride
+
+ vld1.16 {d0}, [r2] ; d0[0..1] = cospi8sqrt2minus1 halfwords, d0[2..3] = sinpi8sqrt2 halfwords
+ vswp d3, d4 ; lo q2(vp[4] vp[12])
+ vswp d17, d18 ; hi q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2] ; lo * constants
+ vqdmulh.s16 q4, q2, d0[0]
+ vqdmulh.s16 q10, q9, d0[2] ; hi * constants
+ vqdmulh.s16 q11, q9, d0[0]
+
+ vqadd.s16 d12, d2, d3 ; lo a1
+ vqsub.s16 d13, d2, d3 ; lo b1
+ vqadd.s16 d26, d16, d17 ; hi a1
+ vqsub.s16 d27, d16, d17 ; hi b1
+
+ vshr.s16 q3, q3, #1 ; lo
+ vshr.s16 q4, q4, #1
+ vshr.s16 q10, q10, #1 ; hi
+ vshr.s16 q11, q11, #1
+
+ vqadd.s16 q3, q3, q2 ; lo
+ vqadd.s16 q4, q4, q2
+ vqadd.s16 q10, q10, q9 ; hi
+ vqadd.s16 q11, q11, q9
+
+ vqsub.s16 d10, d6, d9 ; lo c1
+ vqadd.s16 d11, d7, d8 ; lo d1
+ vqsub.s16 d24, d20, d23 ; hi c1
+ vqadd.s16 d25, d21, d22 ; hi d1
+
+ vqadd.s16 d2, d12, d11 ; lo
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+ vqadd.s16 d16, d26, d25 ; hi
+ vqadd.s16 d17, d27, d24
+ vqsub.s16 d18, d27, d24
+ vqsub.s16 d19, d26, d25
+
+ vtrn.32 d2, d4 ; lo (4x4 transpose between the two passes)
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+ vtrn.32 d16, d18 ; hi
+ vtrn.32 d17, d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ vswp d3, d4 ; lo
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+ vswp d17, d18 ; hi
+ vqdmulh.s16 q10, q9, d0[2]
+ vqdmulh.s16 q11, q9, d0[0]
+
+ vqadd.s16 d12, d2, d3 ; lo a1
+ vqsub.s16 d13, d2, d3 ; lo b1
+ vqadd.s16 d26, d16, d17 ; hi a1
+ vqsub.s16 d27, d16, d17 ; hi b1
+
+ vshr.s16 q3, q3, #1 ; lo
+ vshr.s16 q4, q4, #1
+ vshr.s16 q10, q10, #1 ; hi
+ vshr.s16 q11, q11, #1
+
+ vqadd.s16 q3, q3, q2 ; lo
+ vqadd.s16 q4, q4, q2
+ vqadd.s16 q10, q10, q9 ; hi
+ vqadd.s16 q11, q11, q9
+
+ vqsub.s16 d10, d6, d9 ; lo c1
+ vqadd.s16 d11, d7, d8 ; lo d1
+ vqsub.s16 d24, d20, d23 ; hi c1
+ vqadd.s16 d25, d21, d22 ; hi d1
+
+ vqadd.s16 d2, d12, d11 ; lo
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+ vqadd.s16 d16, d26, d25 ; hi
+ vqadd.s16 d17, d27, d24
+ vqsub.s16 d18, d27, d24
+ vqsub.s16 d19, d26, d25
+
+ vrshr.s16 q1, q1, #3 ; lo: rounding shift right by 3
+ vrshr.s16 q2, q2, #3
+ vrshr.s16 q8, q8, #3 ; hi
+ vrshr.s16 q9, q9, #3
+
+ vtrn.32 d2, d4 ; lo (transpose back before adding the predictor rows)
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+ vtrn.32 d16, d18 ; hi
+ vtrn.32 d17, d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ vaddw.u8 q1, q1, d14 ; lo
+ vaddw.u8 q2, q2, d15
+ vaddw.u8 q8, q8, d28 ; hi
+ vaddw.u8 q9, q9, d29
+
+ vmov.i16 q14, #0 ; hi pred in d28/d29 has been consumed above, reuse q14/q15 as zeros
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ vqmovun.s16 d0, q1 ; lo
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q8 ; hi
+ vqmovun.s16 d3, q9
+
+ add r2, r3, #4 ; hi
+ vst1.32 {d0[0]}, [r3], r1 ; lo
+ vst1.32 {d0[1]}, [r3], r1
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r3]
+ vst1.32 {d2[0]}, [r2], r1 ; hi
+ vst1.32 {d2[1]}, [r2], r1
+ vst1.32 {d3[0]}, [r2], r1
+ vst1.32 {d3[1]}, [r2]
+
+ bx lr
+
+ ENDP ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_ DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
+
+ END
diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
new file mode 100644
index 000000000..14960f99c
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -0,0 +1,183 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+; unsigned char *dst, int pitch, int stride);
+; r0 *q,
+; r1 *dq,
+; r2 *pre
+; r3 *dst
+; sp pitch
+; sp+4 stride
+|idct_dequant_full_2x_neon| PROC ; dequantize + 4x4 inverse transform + predictor add for two adjacent blocks. NOTE(review): q4-q7 (d8-d15) are overwritten without save/restore; AAPCS lists them as callee-saved - confirm callers tolerate this
+ vld1.16 {q3, q4}, [r0] ; lo input
+ vld1.16 {q5, q6}, [r1] ; use the same dq for both
+ ldr r1, [sp] ; pitch
+ add r0, r0, #32 ; r0 -> hi input (q + 16 shorts)
+ vld1.16 {q10, q11}, [r0] ; hi input
+ add r12, r2, #4 ; r12 -> hi pred (pre + 4)
+ vld1.32 {d14[0]}, [r2], r1 ; lo pred
+ vld1.32 {d14[1]}, [r2], r1
+ vld1.32 {d15[0]}, [r2], r1
+ vld1.32 {d15[1]}, [r2]
+ vld1.32 {d28[0]}, [r12], r1 ; hi pred
+ vld1.32 {d28[1]}, [r12], r1
+ vld1.32 {d29[0]}, [r12], r1
+ vld1.32 {d29[1]}, [r12]
+
+ ldr r2, _CONSTANTS_ ; r2 = address of the cospi8sqrt2minus1 / sinpi8sqrt2 words in the constant pool
+
+ vmul.i16 q1, q3, q5 ; lo input * dq
+ vmul.i16 q2, q4, q6
+ vmul.i16 q8, q10, q5 ; hi input * dq
+ vmul.i16 q9, q11, q6
+
+ ldr r1, [sp, #4] ; stride
+
+ vld1.16 {d0}, [r2] ; d0[0..1] = cospi8sqrt2minus1 halfwords, d0[2..3] = sinpi8sqrt2 halfwords
+ vswp d3, d4 ; lo q2(vp[4] vp[12])
+ vswp d17, d18 ; hi q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2] ; lo * constants
+ vqdmulh.s16 q4, q2, d0[0]
+ vqdmulh.s16 q10, q9, d0[2] ; hi * constants
+ vqdmulh.s16 q11, q9, d0[0]
+
+ vqadd.s16 d12, d2, d3 ; lo a1
+ vqsub.s16 d13, d2, d3 ; lo b1
+ vqadd.s16 d26, d16, d17 ; hi a1
+ vqsub.s16 d27, d16, d17 ; hi b1
+
+ vshr.s16 q3, q3, #1 ; lo
+ vshr.s16 q4, q4, #1
+ vshr.s16 q10, q10, #1 ; hi
+ vshr.s16 q11, q11, #1
+
+ vqadd.s16 q3, q3, q2 ; lo
+ vqadd.s16 q4, q4, q2
+ vqadd.s16 q10, q10, q9 ; hi
+ vqadd.s16 q11, q11, q9
+
+ vqsub.s16 d10, d6, d9 ; lo c1
+ vqadd.s16 d11, d7, d8 ; lo d1
+ vqsub.s16 d24, d20, d23 ; hi c1
+ vqadd.s16 d25, d21, d22 ; hi d1
+
+ vqadd.s16 d2, d12, d11 ; lo
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+ vqadd.s16 d16, d26, d25 ; hi
+ vqadd.s16 d17, d27, d24
+ vqsub.s16 d18, d27, d24
+ vqsub.s16 d19, d26, d25
+
+ vtrn.32 d2, d4 ; lo (4x4 transpose between the two passes)
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+ vtrn.32 d16, d18 ; hi
+ vtrn.32 d17, d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ vswp d3, d4 ; lo
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+ vswp d17, d18 ; hi
+ vqdmulh.s16 q10, q9, d0[2]
+ vqdmulh.s16 q11, q9, d0[0]
+
+ vqadd.s16 d12, d2, d3 ; lo a1
+ vqsub.s16 d13, d2, d3 ; lo b1
+ vqadd.s16 d26, d16, d17 ; hi a1
+ vqsub.s16 d27, d16, d17 ; hi b1
+
+ vshr.s16 q3, q3, #1 ; lo
+ vshr.s16 q4, q4, #1
+ vshr.s16 q10, q10, #1 ; hi
+ vshr.s16 q11, q11, #1
+
+ vqadd.s16 q3, q3, q2 ; lo
+ vqadd.s16 q4, q4, q2
+ vqadd.s16 q10, q10, q9 ; hi
+ vqadd.s16 q11, q11, q9
+
+ vqsub.s16 d10, d6, d9 ; lo c1
+ vqadd.s16 d11, d7, d8 ; lo d1
+ vqsub.s16 d24, d20, d23 ; hi c1
+ vqadd.s16 d25, d21, d22 ; hi d1
+
+ vqadd.s16 d2, d12, d11 ; lo
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+ vqadd.s16 d16, d26, d25 ; hi
+ vqadd.s16 d17, d27, d24
+ vqsub.s16 d18, d27, d24
+ vqsub.s16 d19, d26, d25
+
+ vrshr.s16 q1, q1, #3 ; lo: rounding shift right by 3
+ vrshr.s16 q2, q2, #3
+ vrshr.s16 q8, q8, #3 ; hi
+ vrshr.s16 q9, q9, #3
+
+ vtrn.32 d2, d4 ; lo (transpose back before adding the predictor rows)
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+ vtrn.32 d16, d18 ; hi
+ vtrn.32 d17, d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ vaddw.u8 q1, q1, d14 ; lo
+ vaddw.u8 q2, q2, d15
+ vaddw.u8 q8, q8, d28 ; hi
+ vaddw.u8 q9, q9, d29
+
+ vmov.i16 q14, #0 ; hi pred in d28/d29 has been consumed above, reuse q14/q15 as zeros
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ vqmovun.s16 d0, q1 ; lo
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q8 ; hi
+ vqmovun.s16 d3, q9
+
+ add r2, r3, #4 ; hi
+ vst1.32 {d0[0]}, [r3], r1 ; lo
+ vst1.32 {d0[1]}, [r3], r1
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r3]
+ vst1.32 {d2[0]}, [r2], r1 ; hi
+ vst1.32 {d2[1]}, [r2], r1
+ vst1.32 {d3[0]}, [r2], r1
+ vst1.32 {d3[1]}, [r2]
+
+ bx lr
+
+ ENDP ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_ DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
+
+ END
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index 989232cd3..ae0610cda 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -25,7 +25,10 @@ VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c