Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_avg_neon.c | 5
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_neon.c | 5
-rw-r--r--  vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm | 69
-rw-r--r--  vp9/common/arm/neon/vp9_idct16x16_add_neon.c | 4
-rw-r--r--  vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c | 10
-rw-r--r--  vp9/common/arm/neon/vp9_idct32x32_add_neon.c | 14
-rw-r--r--  vp9/common/arm/neon/vp9_idct8x8_add_neon.c | 6
-rw-r--r--  vp9/common/arm/neon/vp9_iht4x4_add_neon.asm | 237
-rw-r--r--  vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 248
-rw-r--r--  vp9/common/arm/neon/vp9_iht8x8_add_neon.asm | 698
-rw-r--r--  vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 624
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 171
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_neon.c | 6
-rw-r--r--  vp9/common/arm/neon/vp9_reconintra_neon.c | 473
-rw-r--r--  vp9/common/arm/neon/vp9_reconintra_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_reconintra_neon.asm) | 0
-rw-r--r--  vp9/common/vp9_alloccommon.c | 10
-rw-r--r--  vp9/common/vp9_blockd.h | 18
-rw-r--r--  vp9/common/vp9_entropymode.c | 3
-rw-r--r--  vp9/common/vp9_entropymode.h | 1
-rw-r--r--  vp9/common/vp9_loopfilter.c | 2
-rw-r--r--  vp9/common/vp9_mfqe.c | 89
-rw-r--r--  vp9/common/vp9_onyxc_int.h | 6
-rw-r--r--  vp9/common/vp9_postproc.c | 12
-rw-r--r--  vp9/common/vp9_rtcd.c | 2
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl | 73
-rw-r--r--  vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 108
-rw-r--r--  vp9/decoder/vp9_decodeframe.c | 6
-rw-r--r--  vp9/decoder/vp9_decoder.c | 7
-rw-r--r--  vp9/decoder/vp9_read_bit_buffer.c | 16
-rw-r--r--  vp9/encoder/arm/neon/vp9_quantize_neon.c | 3
-rw-r--r--  vp9/encoder/vp9_bitstream.c | 2
-rw-r--r--  vp9/encoder/vp9_block.h | 2
-rw-r--r--  vp9/encoder/vp9_dct.c | 3
-rw-r--r--  vp9/encoder/vp9_denoiser.c | 5
-rw-r--r--  vp9/encoder/vp9_encodeframe.c | 50
-rw-r--r--  vp9/encoder/vp9_encodemb.c | 60
-rw-r--r--  vp9/encoder/vp9_encoder.c | 34
-rw-r--r--  vp9/encoder/vp9_encoder.h | 7
-rw-r--r--  vp9/encoder/vp9_lookahead.c | 4
-rw-r--r--  vp9/encoder/vp9_pickmode.c | 68
-rw-r--r--  vp9/encoder/vp9_quantize.c | 75
-rw-r--r--  vp9/encoder/vp9_quantize.h | 2
-rw-r--r--  vp9/encoder/vp9_rd.c | 6
-rw-r--r--  vp9/encoder/vp9_rdopt.c | 74
-rw-r--r--  vp9/encoder/vp9_speed_features.c | 10
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.c | 4
-rw-r--r--  vp9/encoder/vp9_temporal_filter.c | 5
-rw-r--r--  vp9/encoder/vp9_tokenize.c | 172
-rw-r--r--  vp9/encoder/vp9_tokenize.h | 41
-rw-r--r--  vp9/encoder/x86/vp9_dct_sse2.c | 3
-rw-r--r--  vp9/encoder/x86/vp9_dct_ssse3.c | 3
-rw-r--r--  vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c | 26
-rw-r--r--  vp9/encoder/x86/vp9_quantize_sse2.c | 8
-rw-r--r--  vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 12
-rw-r--r--  vp9/vp9_common.mk | 11
-rw-r--r--  vp9/vp9_cx_iface.c | 5
-rw-r--r--  vp9/vp9_dx_iface.c | 24
57 files changed, 2121 insertions(+), 1521 deletions(-)
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
index 2f8dda07c..dd569d348 100644
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
@@ -11,6 +11,9 @@
#include <stddef.h>
#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -22,7 +25,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-static inline int32x4_t MULTIPLY_BY_Q0(
+static INLINE int32x4_t MULTIPLY_BY_Q0(
int16x4_t dsrc0,
int16x4_t dsrc1,
int16x4_t dsrc2,
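Note: the inline → INLINE change in these NEON sources leans on the portability macro pulled in by the new "./vpx_config.h" include. As a rough illustration only (the real definition is generated into vpx_config.h by the configure script, so the exact expansion is toolchain-dependent), the macro looks something like this:

    /* Hypothetical sketch of the INLINE portability macro; the actual
     * definition is emitted by configure into vpx_config.h. */
    #if defined(_MSC_VER)
    #define INLINE __forceinline
    #else
    #define INLINE inline
    #endif

With the macro in scope, static INLINE helpers such as MULTIPLY_BY_Q0 compile the same way on compilers that spell the inline keyword differently.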
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c
index c8704aa9c..5c555c458 100644
--- a/vp9/common/arm/neon/vp9_convolve8_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.c
@@ -11,6 +11,9 @@
#include <stddef.h>
#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -22,7 +25,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-static inline int32x4_t MULTIPLY_BY_Q0(
+static INLINE int32x4_t MULTIPLY_BY_Q0(
int16x4_t dsrc0,
int16x4_t dsrc1,
int16x4_t dsrc2,
diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
deleted file mode 100644
index 60a0d98c5..000000000
--- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_dc_only_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
-; uint8_t *dst_ptr, int pitch, int stride)
-;
-; r0 int input_dc
-; r1 uint8_t *pred_ptr
-; r2 uint8_t *dst_ptr
-; r3 int pitch
-; sp int stride
-
-|vp9_dc_only_idct_add_neon| PROC
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- mul r0, r0, r12 ; input_dc * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; ROUND_POWER_OF_TWO(out, 4)
- add r0, r0, #8 ; + (1 <<((4) - 1))
- asr r0, r0, #4 ; >> 4
-
- vdup.16 q0, r0; ; duplicate a1
- ldr r12, [sp] ; load stride
-
- vld1.32 {d2[0]}, [r1], r3
- vld1.32 {d2[1]}, [r1], r3
- vld1.32 {d4[0]}, [r1], r3
- vld1.32 {d4[1]}, [r1]
-
- vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c]
- vaddw.u8 q2, q0, d4
-
- vqmovun.s16 d2, q1 ; clip_pixel
- vqmovun.s16 d4, q2
-
- vst1.32 {d2[0]}, [r2], r12
- vst1.32 {d2[1]}, [r2], r12
- vst1.32 {d4[0]}, [r2], r12
- vst1.32 {d4[1]}, [r2]
-
- bx lr
- ENDP ; |vp9_dc_only_idct_add_neon|
-
- END
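For reference, the arithmetic the deleted routine carried out (following its own comments) boils down to the scalar computation below. This is only a sketch: ROUND_POWER_OF_TWO and clip_pixel are written out inline rather than taken from the vp9 headers, and 11585 is cospi_16_64 with DCT_CONST_BITS = 14, as noted in the assembly above.

    #include <stdint.h>

    /* Scalar sketch of the DC-only IDCT-add the deleted assembly performed. */
    static int dct_const_round_shift_sketch(int x) {
      return (x + (1 << 13)) >> 14;        /* + (1 << (DCT_CONST_BITS - 1)), >> 14 */
    }

    static void dc_only_idct_add_sketch(int input_dc, const uint8_t *pred_ptr,
                                        uint8_t *dst_ptr, int pitch, int stride) {
      int r, c;
      int a1 = dct_const_round_shift_sketch(input_dc * 11585);
      a1 = dct_const_round_shift_sketch(a1 * 11585);
      a1 = (a1 + 8) >> 4;                  /* ROUND_POWER_OF_TWO(out, 4) */
      for (r = 0; r < 4; ++r) {
        for (c = 0; c < 4; ++c) {
          const int v = a1 + pred_ptr[c];
          dst_ptr[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* clip_pixel */
        }
        pred_ptr += pitch;
        dst_ptr += stride;
      }
    }

The NEON version above did the same thing four pixels per row at a time: vdup.16 broadcast a1, vaddw.u8 added it to the predictor rows, and vqmovun.s16 provided the clip.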
diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
index 68d7cccc0..5fa3f5c01 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
@@ -10,6 +10,8 @@
#include <arm_neon.h>
+#include "./vpx_config.h"
+
static int16_t cospi_2_64 = 16305;
static int16_t cospi_4_64 = 16069;
static int16_t cospi_6_64 = 15679;
@@ -26,7 +28,7 @@ static int16_t cospi_26_64 = 4756;
static int16_t cospi_28_64 = 3196;
static int16_t cospi_30_64 = 1606;
-static inline void TRANSPOSE8X8(
+static INLINE void TRANSPOSE8X8(
int16x8_t *q8s16,
int16x8_t *q9s16,
int16x8_t *q10s16,
diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
index 1bfee22b2..d0e4b4f40 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
@@ -11,7 +11,9 @@
#include <arm_neon.h>
#include "vp9/common/vp9_idct.h"
-static inline void LD_16x8(
+#include "./vpx_config.h"
+
+static INLINE void LD_16x8(
uint8_t *d,
int d_stride,
uint8x16_t *q8u8,
@@ -40,7 +42,7 @@ static inline void LD_16x8(
return;
}
-static inline void ADD_DIFF_16x8(
+static INLINE void ADD_DIFF_16x8(
uint8x16_t qdiffu8,
uint8x16_t *q8u8,
uint8x16_t *q9u8,
@@ -61,7 +63,7 @@ static inline void ADD_DIFF_16x8(
return;
}
-static inline void SUB_DIFF_16x8(
+static INLINE void SUB_DIFF_16x8(
uint8x16_t qdiffu8,
uint8x16_t *q8u8,
uint8x16_t *q9u8,
@@ -82,7 +84,7 @@ static inline void SUB_DIFF_16x8(
return;
}
-static inline void ST_16x8(
+static INLINE void ST_16x8(
uint8_t *d,
int d_stride,
uint8x16_t *q8u8,
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
index 53f721b44..309bdf8d7 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
@@ -10,6 +10,8 @@
#include <arm_neon.h>
+#include "./vpx_config.h"
+
static int16_t cospi_1_64 = 16364;
static int16_t cospi_2_64 = 16305;
static int16_t cospi_3_64 = 16207;
@@ -57,7 +59,7 @@ static int16_t cospi_31_64 = 804;
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
__STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
q6s16, q7s16, q8s16, q9s16);
-static inline void __STORE_COMBINE_CENTER_RESULTS(
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
uint8_t *p1,
uint8_t *p2,
int stride,
@@ -105,7 +107,7 @@ static inline void __STORE_COMBINE_CENTER_RESULTS(
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
__STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
q4s16, q5s16, q6s16, q7s16);
-static inline void __STORE_COMBINE_EXTREME_RESULTS(
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
uint8_t *p1,
uint8_t *p2,
int stride,
@@ -152,7 +154,7 @@ static inline void __STORE_COMBINE_EXTREME_RESULTS(
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static inline void DO_BUTTERFLY(
+static INLINE void DO_BUTTERFLY(
int16x8_t q14s16,
int16x8_t q13s16,
int16_t first_const,
@@ -194,7 +196,7 @@ static inline void DO_BUTTERFLY(
return;
}
-static inline void idct32_transpose_pair(
+static INLINE void idct32_transpose_pair(
int16_t *input,
int16_t *t_buf) {
int16_t *in;
@@ -288,7 +290,7 @@ static inline void idct32_transpose_pair(
return;
}
-static inline void idct32_bands_end_1st_pass(
+static INLINE void idct32_bands_end_1st_pass(
int16_t *out,
int16x8_t q2s16,
int16x8_t q3s16,
@@ -383,7 +385,7 @@ static inline void idct32_bands_end_1st_pass(
return;
}
-static inline void idct32_bands_end_2nd_pass(
+static INLINE void idct32_bands_end_2nd_pass(
int16_t *out,
uint8_t *dest,
int stride,
diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
index 50587f6bc..2b3c1ce60 100644
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
@@ -10,6 +10,8 @@
#include <arm_neon.h>
+#include "./vpx_config.h"
+
static int16_t cospi_4_64 = 16069;
static int16_t cospi_8_64 = 15137;
static int16_t cospi_12_64 = 13623;
@@ -18,7 +20,7 @@ static int16_t cospi_20_64 = 9102;
static int16_t cospi_24_64 = 6270;
static int16_t cospi_28_64 = 3196;
-static inline void TRANSPOSE8X8(
+static INLINE void TRANSPOSE8X8(
int16x8_t *q8s16,
int16x8_t *q9s16,
int16x8_t *q10s16,
@@ -87,7 +89,7 @@ static inline void TRANSPOSE8X8(
return;
}
-static inline void IDCT8x8_1D(
+static INLINE void IDCT8x8_1D(
int16x8_t *q8s16,
int16x8_t *q9s16,
int16x8_t *q10s16,
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e24c..000000000
--- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht4x4_16_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
- ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
- ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
- ; into d16-d19 registers. This macro will touch q10- q15 registers and use
- ; them as buffer during calculation.
- MACRO
- IDCT4x4_1D
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
- vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
- vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
- vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q10, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
- vswp d18, d19
- MEND
-
- ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
- ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
- ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
- ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
- ; q14,q15 registers and use them as buffer during calculation.
- MACRO
- IADST4x4_1D
- vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
- vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
- vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
- vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
- vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
- vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
- vaddw.s16 q15, q15, d19 ; x0 + x3
- vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
- vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
- vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
-
- vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
- vadd.s32 q10, q10, q8
- vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
- vdup.32 q8, r0 ; duplicate sinpi_3_9
- vsub.s32 q11, q11, q9
- vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
-
- vadd.s32 q13, q10, q12 ; s0 = x0 + x3
- vadd.s32 q10, q10, q11 ; x0 + x1
- vadd.s32 q14, q11, q12 ; s1 = x1 + x3
- vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
-
- ; dct_const_round_shift
- vqrshrn.s32 d16, q13, #14
- vqrshrn.s32 d17, q14, #14
- vqrshrn.s32 d18, q15, #14
- vqrshrn.s32 d19, q10, #14
- MEND
-
- ; Generate cosine constants in d6 - d8 for the IDCT
- MACRO
- GENERATE_COSINE_CONSTANTS
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x187e
- mov r12, #0x1800
- add r12, #0x7e
-
- ; generate constant vectors
- vdup.16 d0, r0 ; duplicate cospi_8_64
- vdup.16 d1, r3 ; duplicate cospi_16_64
- vdup.16 d2, r12 ; duplicate cospi_24_64
- MEND
-
- ; Generate sine constants in d1 - d4 for the IADST.
- MACRO
- GENERATE_SINE_CONSTANTS
- ; sinpi_1_9 = 5283 = 0x14A3
- mov r0, #0x1400
- add r0, #0xa3
- ; sinpi_2_9 = 9929 = 0x26C9
- mov r3, #0x2600
- add r3, #0xc9
- ; sinpi_4_9 = 15212 = 0x3B6C
- mov r12, #0x3b00
- add r12, #0x6c
-
- ; generate constant vectors
- vdup.16 d3, r0 ; duplicate sinpi_1_9
-
- ; sinpi_3_9 = 13377 = 0x3441
- mov r0, #0x3400
- add r0, #0x41
-
- vdup.16 d4, r3 ; duplicate sinpi_2_9
- vdup.16 d5, r12 ; duplicate sinpi_4_9
- vdup.16 q3, r0 ; duplicate sinpi_3_9
- MEND
-
- ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
- MACRO
- TRANSPOSE4X4
- vtrn.16 d16, d17
- vtrn.16 d18, d19
- vtrn.32 q8, q9
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
-
- ; transpose the input data
- TRANSPOSE4X4
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IDCT4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IDCT4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
- ; generate constants
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
- ; ROUND_POWER_OF_TWO(temp_out[j], 4)
- vrshr.s16 q8, q8, #4
- vrshr.s16 q9, q9, #4
-
- vld1.32 {d26[0]}, [r1], r2
- vld1.32 {d26[1]}, [r1], r2
- vld1.32 {d27[0]}, [r1], r2
- vld1.32 {d27[1]}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d26
- vaddw.u8 q9, q9, d27
-
- ; clip_pixel
- vqmovun.s16 d26, q8
- vqmovun.s16 d27, q9
-
- ; do the stores in reverse order with negative post-increment, by changing
- ; the sign of the stride
- rsb r2, r2, #0
- vst1.32 {d27[1]}, [r1], r2
- vst1.32 {d27[0]}, [r1], r2
- vst1.32 {d26[1]}, [r1], r2
- vst1.32 {d26[0]}, [r1] ; no post-increment
- bx lr
- ENDP ; |vp9_iht4x4_16_add_neon|
-
- END
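The replacement C file that follows dispatches on the tx_type argument; the numeric values match the transform-pair enumeration used throughout vp9 (sketched here for orientation only — the real enum lives in vp9's common headers):

    /* Sketch of the tx_type values the NEON hybrid transforms dispatch on. */
    typedef enum {
      DCT_DCT   = 0,  /* not handled by the intrinsics; falls back to the C version */
      ADST_DCT  = 1,  /* "iadst_idct": IDCT on rows, then IADST on columns */
      DCT_ADST  = 2,  /* "idct_iadst": IADST on rows, then IDCT on columns */
      ADST_ADST = 3   /* "iadst_iadst": IADST in both directions */
    } TX_TYPE_SKETCH;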
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 000000000..1761fada2
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
+static INLINE void TRANSPOSE4X4(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int32x4_t q8s32, q9s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+
+ d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+ d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+ q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+ q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+ q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+ *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+ *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+ return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16) {
+ *d0s16 = vdup_n_s16(cospi_8_64);
+ *d1s16 = vdup_n_s16(cospi_16_64);
+ *d2s16 = vdup_n_s16(cospi_24_64);
+ return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16) {
+ *d3s16 = vdup_n_s16(sinpi_1_9);
+ *d4s16 = vdup_n_s16(sinpi_2_9);
+ *q3s16 = vdupq_n_s16(sinpi_3_9);
+ *d5s16 = vdup_n_s16(sinpi_4_9);
+ return;
+}
+
+static INLINE void IDCT4x4_1D(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ int32x4_t q10s32, q13s32, q14s32, q15s32;
+ int16x8_t q13s16, q14s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, *d2s16);
+ q10s32 = vmull_s16(d17s16, *d0s16);
+ q13s32 = vmull_s16(d23s16, *d1s16);
+ q14s32 = vmull_s16(d24s16, *d1s16);
+ q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+ q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+ *q8s16 = vaddq_s16(q13s16, q14s16);
+ *q9s16 = vsubq_s16(q13s16, q14s16);
+ *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+ vget_low_s16(*q9s16)); // vswp
+ return;
+}
+
+static INLINE void IADST4x4_1D(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d6s16 = vget_low_s16(*q3s16);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ q10s32 = vmull_s16(*d3s16, d16s16);
+ q11s32 = vmull_s16(*d4s16, d16s16);
+ q12s32 = vmull_s16(d6s16, d17s16);
+ q13s32 = vmull_s16(*d5s16, d18s16);
+ q14s32 = vmull_s16(*d3s16, d18s16);
+ q15s32 = vmovl_s16(d16s16);
+ q15s32 = vaddw_s16(q15s32, d19s16);
+ q8s32 = vmull_s16(*d4s16, d19s16);
+ q15s32 = vsubw_s16(q15s32, d18s16);
+ q9s32 = vmull_s16(*d5s16, d19s16);
+
+ q10s32 = vaddq_s32(q10s32, q13s32);
+ q10s32 = vaddq_s32(q10s32, q8s32);
+ q11s32 = vsubq_s32(q11s32, q14s32);
+ q8s32 = vdupq_n_s32(sinpi_3_9);
+ q11s32 = vsubq_s32(q11s32, q9s32);
+ q15s32 = vmulq_s32(q15s32, q8s32);
+
+ q13s32 = vaddq_s32(q10s32, q12s32);
+ q10s32 = vaddq_s32(q10s32, q11s32);
+ q14s32 = vaddq_s32(q11s32, q12s32);
+ q10s32 = vsubq_s32(q10s32, q12s32);
+
+ d16s16 = vqrshrn_n_s32(q13s32, 14);
+ d17s16 = vqrshrn_n_s32(q14s32, 14);
+ d18s16 = vqrshrn_n_s32(q15s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+ *q8s16 = vcombine_s16(d16s16, d17s16);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ return;
+}
+
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ uint8x8_t d26u8, d27u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+ uint32x2_t d26u32, d27u32;
+ int16x8_t q3s16, q8s16, q9s16;
+ uint16x8_t q8u16, q9u16;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;
+ case 1: // iadst_idct
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ case 2: // idct_iadst
+      // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ break;
+ case 3: // iadst_iadst
+ // generate constants
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ default: // iadst_idct
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+ dest += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ return;
+}
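Throughout the intrinsics, dct_const_round_shift() is expressed as vqrshrn_n_s32(x, 14): a rounding, saturating, narrowing right shift by DCT_CONST_BITS. A minimal comparison of the scalar form and the NEON form, assuming DCT_CONST_BITS == 14 as for the constants above:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Scalar rounding shift used by the C reference code. */
    static int16_t round_shift_scalar(int32_t x) {
      return (int16_t)((x + (1 << 13)) >> 14);
    }

    /* NEON equivalent for four lanes at once; vqrshrn_n_s32 additionally
     * saturates to the int16_t range, which in-range transform data stays within. */
    static int16x4_t round_shift_neon(int32x4_t x) {
      return vqrshrn_n_s32(x, 14);
    }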
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f5661b..000000000
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht8x8_64_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Generate IADST constants in r0 - r12 for the IADST.
- MACRO
- GENERATE_IADST_CONSTANTS
- ; generate cospi_2_64 = 16305
- mov r0, #0x3f00
- add r0, #0xb1
-
- ; generate cospi_30_64 = 1606
- mov r1, #0x600
- add r1, #0x46
-
- ; generate cospi_10_64 = 14449
- mov r2, #0x3800
- add r2, #0x71
-
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
-
- ; generate cospi_18_64 = 10394
- mov r4, #0x2800
- add r4, #0x9a
-
- ; generate cospi_14_64 = 12665
- mov r5, #0x3100
- add r5, #0x79
-
- ; generate cospi_26_64 = 4756
- mov r6, #0x1200
- add r6, #0x94
-
- ; generate cospi_6_64 = 15679
- mov r7, #0x3d00
- add r7, #0x3f
-
- ; generate cospi_8_64 = 15137
- mov r8, #0x3b00
- add r8, #0x21
-
- ; generate cospi_24_64 = 6270
- mov r9, #0x1800
- add r9, #0x7e
-
- ; generate 0
- mov r10, #0
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
- MEND
-
- ; Generate IDCT constants in r3 - r9 for the IDCT.
- MACRO
- GENERATE_IDCT_CONSTANTS
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
- MEND
-
- ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
- ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
- ; will be stored back into q8-q15 registers. This macro will touch q0-q7
- ; registers and use them as buffer during calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 -odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
- ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
- ; output will be stored back into q8-q15 registers. This macro will touch
- ; q0 - q7 registers and use them as buffer during calculation.
- MACRO
- IADST8X8_1D
- vdup.16 d14, r0 ; duplicate cospi_2_64
- vdup.16 d15, r1 ; duplicate cospi_30_64
-
- ; cospi_2_64 * x0
- vmull.s16 q1, d30, d14
- vmull.s16 q2, d31, d14
-
- ; cospi_30_64 * x0
- vmull.s16 q3, d30, d15
- vmull.s16 q4, d31, d15
-
- vdup.16 d30, r4 ; duplicate cospi_18_64
- vdup.16 d31, r5 ; duplicate cospi_14_64
-
- ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- vmlal.s16 q1, d16, d15
- vmlal.s16 q2, d17, d15
-
- ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
- vmlsl.s16 q3, d16, d14
- vmlsl.s16 q4, d17, d14
-
- ; cospi_18_64 * x4
- vmull.s16 q5, d22, d30
- vmull.s16 q6, d23, d30
-
- ; cospi_14_64 * x4
- vmull.s16 q7, d22, d31
- vmull.s16 q8, d23, d31
-
- ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- vmlal.s16 q5, d24, d31
- vmlal.s16 q6, d25, d31
-
- ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
- vmlsl.s16 q7, d24, d30
- vmlsl.s16 q8, d25, d30
-
- ; (s0 + s4)
- vadd.s32 q11, q1, q5
- vadd.s32 q12, q2, q6
-
- vdup.16 d0, r2 ; duplicate cospi_10_64
- vdup.16 d1, r3 ; duplicate cospi_22_64
-
- ; (s0 - s4)
- vsub.s32 q1, q1, q5
- vsub.s32 q2, q2, q6
-
- ; x0 = dct_const_round_shift(s0 + s4);
- vqrshrn.s32 d22, q11, #14 ; >> 14
- vqrshrn.s32 d23, q12, #14 ; >> 14
-
- ; (s1 + s5)
- vadd.s32 q12, q3, q7
- vadd.s32 q15, q4, q8
-
- ; (s1 - s5)
- vsub.s32 q3, q3, q7
- vsub.s32 q4, q4, q8
-
- ; x4 = dct_const_round_shift(s0 - s4);
- vqrshrn.s32 d2, q1, #14 ; >> 14
- vqrshrn.s32 d3, q2, #14 ; >> 14
-
- ; x1 = dct_const_round_shift(s1 + s5);
- vqrshrn.s32 d24, q12, #14 ; >> 14
- vqrshrn.s32 d25, q15, #14 ; >> 14
-
- ; x5 = dct_const_round_shift(s1 - s5);
- vqrshrn.s32 d6, q3, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; cospi_10_64 * x2
- vmull.s16 q4, d26, d0
- vmull.s16 q5, d27, d0
-
- ; cospi_22_64 * x2
- vmull.s16 q2, d26, d1
- vmull.s16 q6, d27, d1
-
- vdup.16 d30, r6 ; duplicate cospi_26_64
- vdup.16 d31, r7 ; duplicate cospi_6_64
-
- ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- vmlal.s16 q4, d20, d1
- vmlal.s16 q5, d21, d1
-
- ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- vmlsl.s16 q2, d20, d0
- vmlsl.s16 q6, d21, d0
-
- ; cospi_26_64 * x6
- vmull.s16 q0, d18, d30
- vmull.s16 q13, d19, d30
-
- ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- vmlal.s16 q0, d28, d31
- vmlal.s16 q13, d29, d31
-
- ; cospi_6_64 * x6
- vmull.s16 q10, d18, d31
- vmull.s16 q9, d19, d31
-
- ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- vmlsl.s16 q10, d28, d30
- vmlsl.s16 q9, d29, d30
-
- ; (s3 + s7)
- vadd.s32 q14, q2, q10
- vadd.s32 q15, q6, q9
-
- ; (s3 - s7)
- vsub.s32 q2, q2, q10
- vsub.s32 q6, q6, q9
-
- ; x3 = dct_const_round_shift(s3 + s7);
- vqrshrn.s32 d28, q14, #14 ; >> 14
- vqrshrn.s32 d29, q15, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s3 - s7);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; (s2 + s6)
- vadd.s32 q9, q4, q0
- vadd.s32 q10, q5, q13
-
- ; (s2 - s6)
- vsub.s32 q4, q4, q0
- vsub.s32 q5, q5, q13
-
- vdup.16 d30, r8 ; duplicate cospi_8_64
- vdup.16 d31, r9 ; duplicate cospi_24_64
-
- ; x2 = dct_const_round_shift(s2 + s6);
- vqrshrn.s32 d18, q9, #14 ; >> 14
- vqrshrn.s32 d19, q10, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s2 - s6);
- vqrshrn.s32 d8, q4, #14 ; >> 14
- vqrshrn.s32 d9, q5, #14 ; >> 14
-
- ; cospi_8_64 * x4
- vmull.s16 q5, d2, d30
- vmull.s16 q6, d3, d30
-
- ; cospi_24_64 * x4
- vmull.s16 q7, d2, d31
- vmull.s16 q0, d3, d31
-
- ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- vmlal.s16 q5, d6, d31
- vmlal.s16 q6, d7, d31
-
- ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- vmlsl.s16 q7, d6, d30
- vmlsl.s16 q0, d7, d30
-
- ; cospi_8_64 * x7
- vmull.s16 q1, d4, d30
- vmull.s16 q3, d5, d30
-
- ; cospi_24_64 * x7
- vmull.s16 q10, d4, d31
- vmull.s16 q2, d5, d31
-
- ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- vmlsl.s16 q1, d8, d31
- vmlsl.s16 q3, d9, d31
-
- ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
- vmlal.s16 q10, d8, d30
- vmlal.s16 q2, d9, d30
-
- vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
-
- vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
-
- vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
-
- vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
-
- ; (s4 + s6)
- vadd.s32 q14, q5, q1
- vadd.s32 q15, q6, q3
-
- ; (s4 - s6)
- vsub.s32 q5, q5, q1
- vsub.s32 q6, q6, q3
-
- ; x4 = dct_const_round_shift(s4 + s6);
- vqrshrn.s32 d18, q14, #14 ; >> 14
- vqrshrn.s32 d19, q15, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s4 - s6);
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; (s5 + s7)
- vadd.s32 q1, q7, q10
- vadd.s32 q3, q0, q2
-
- ; (s5 - s7))
- vsub.s32 q7, q7, q10
- vsub.s32 q0, q0, q2
-
- ; x5 = dct_const_round_shift(s5 + s7);
- vqrshrn.s32 d28, q1, #14 ; >> 14
- vqrshrn.s32 d29, q3, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s5 - s7);
- vqrshrn.s32 d14, q7, #14 ; >> 14
- vqrshrn.s32 d15, q0, #14 ; >> 14
-
- vdup.16 d30, r12 ; duplicate cospi_16_64
-
- ; cospi_16_64 * x2
- vmull.s16 q2, d22, d30
- vmull.s16 q3, d23, d30
-
- ; cospi_6_64 * x6
- vmull.s16 q13, d22, d30
- vmull.s16 q1, d23, d30
-
- ; cospi_16_64 * x2 + cospi_16_64 * x3;
- vmlal.s16 q2, d24, d30
- vmlal.s16 q3, d25, d30
-
- ; cospi_16_64 * x2 - cospi_16_64 * x3;
- vmlsl.s16 q13, d24, d30
- vmlsl.s16 q1, d25, d30
-
- ; x2 = dct_const_round_shift(s2);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q3, #14 ; >> 14
-
- ;x3 = dct_const_round_shift(s3);
- vqrshrn.s32 d24, q13, #14 ; >> 14
- vqrshrn.s32 d25, q1, #14 ; >> 14
-
- ; cospi_16_64 * x6
- vmull.s16 q13, d10, d30
- vmull.s16 q1, d11, d30
-
- ; cospi_6_64 * x6
- vmull.s16 q11, d10, d30
- vmull.s16 q0, d11, d30
-
- ; cospi_16_64 * x6 + cospi_16_64 * x7;
- vmlal.s16 q13, d14, d30
- vmlal.s16 q1, d15, d30
-
- ; cospi_16_64 * x6 - cospi_16_64 * x7;
- vmlsl.s16 q11, d14, d30
- vmlsl.s16 q0, d15, d30
-
- ; x6 = dct_const_round_shift(s6);
- vqrshrn.s32 d20, q13, #14 ; >> 14
- vqrshrn.s32 d21, q1, #14 ; >> 14
-
- ;x7 = dct_const_round_shift(s7);
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q0, #14 ; >> 14
-
- vdup.16 q5, r10 ; duplicate 0
-
- vsub.s16 q9, q5, q9 ; output[1] = -x4;
- vsub.s16 q11, q5, q2 ; output[3] = -x2;
- vsub.s16 q13, q5, q6 ; output[5] = -x7;
- vsub.s16 q15, q5, q4 ; output[7] = -x1;
- MEND
-
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht8x8_64_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- push {r0-r10}
- vpush {d8-d15}
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; first transform rows
- IDCT8x8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; then transform columns
- IADST8X8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; then transform columns
- IDCT8x8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; then transform columns
- IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
- vpop {d8-d15}
- pop {r0-r10}
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
- bx lr
- ENDP ; |vp9_iht8x8_64_add_neon|
-
- END
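In the deleted assembly each trigonometric constant was assembled from two immediates and then broadcast (for example mov r12, #0x2d00; add r12, #0x41 builds cospi_16_64 = 11585). The intrinsics file below instead keeps the constants as static int16_t values and broadcasts them with vdup_n_s16; a minimal sketch of that pattern, assuming nothing beyond the NEON intrinsics themselves:

    #include <arm_neon.h>
    #include <stdint.h>

    /* 11585 == 0x2d41 == cospi_16_64; broadcast it across a 4x16-bit register. */
    static const int16_t kCospi16_64 = 11585;

    static int16x4_t broadcast_cospi_16_64(void) {
      return vdup_n_s16(kCospi16_64);
    }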
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 000000000..04b342c3d
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
+static INLINE void IADST8X8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q2s16, q4s16, q5s16, q6s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ d14s16 = vdup_n_s16(cospi_2_64);
+ d15s16 = vdup_n_s16(cospi_30_64);
+
+ q1s32 = vmull_s16(d30s16, d14s16);
+ q2s32 = vmull_s16(d31s16, d14s16);
+ q3s32 = vmull_s16(d30s16, d15s16);
+ q4s32 = vmull_s16(d31s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_18_64);
+ d31s16 = vdup_n_s16(cospi_14_64);
+
+ q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+ q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+ q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+ q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+ q5s32 = vmull_s16(d22s16, d30s16);
+ q6s32 = vmull_s16(d23s16, d30s16);
+ q7s32 = vmull_s16(d22s16, d31s16);
+ q8s32 = vmull_s16(d23s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+ q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+ q11s32 = vaddq_s32(q1s32, q5s32);
+ q12s32 = vaddq_s32(q2s32, q6s32);
+ q1s32 = vsubq_s32(q1s32, q5s32);
+ q2s32 = vsubq_s32(q2s32, q6s32);
+
+ d22s16 = vqrshrn_n_s32(q11s32, 14);
+ d23s16 = vqrshrn_n_s32(q12s32, 14);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q12s32 = vaddq_s32(q3s32, q7s32);
+ q15s32 = vaddq_s32(q4s32, q8s32);
+ q3s32 = vsubq_s32(q3s32, q7s32);
+ q4s32 = vsubq_s32(q4s32, q8s32);
+
+ d2s16 = vqrshrn_n_s32(q1s32, 14);
+ d3s16 = vqrshrn_n_s32(q2s32, 14);
+ d24s16 = vqrshrn_n_s32(q12s32, 14);
+ d25s16 = vqrshrn_n_s32(q15s32, 14);
+ d6s16 = vqrshrn_n_s32(q3s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ d0s16 = vdup_n_s16(cospi_10_64);
+ d1s16 = vdup_n_s16(cospi_22_64);
+ q4s32 = vmull_s16(d26s16, d0s16);
+ q5s32 = vmull_s16(d27s16, d0s16);
+ q2s32 = vmull_s16(d26s16, d1s16);
+ q6s32 = vmull_s16(d27s16, d1s16);
+
+ d30s16 = vdup_n_s16(cospi_26_64);
+ d31s16 = vdup_n_s16(cospi_6_64);
+
+ q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+ q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+ q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+ q0s32 = vmull_s16(d18s16, d30s16);
+ q13s32 = vmull_s16(d19s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+ q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+ q10s32 = vmull_s16(d18s16, d31s16);
+ q9s32 = vmull_s16(d19s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+ q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+ q14s32 = vaddq_s32(q2s32, q10s32);
+ q15s32 = vaddq_s32(q6s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q6s32 = vsubq_s32(q6s32, q9s32);
+
+ d28s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ q9s32 = vaddq_s32(q4s32, q0s32);
+ q10s32 = vaddq_s32(q5s32, q13s32);
+ q4s32 = vsubq_s32(q4s32, q0s32);
+ q5s32 = vsubq_s32(q5s32, q13s32);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ d18s16 = vqrshrn_n_s32(q9s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d8s16 = vqrshrn_n_s32(q4s32, 14);
+ d9s16 = vqrshrn_n_s32(q5s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q5s32 = vmull_s16(d2s16, d30s16);
+ q6s32 = vmull_s16(d3s16, d30s16);
+ q7s32 = vmull_s16(d2s16, d31s16);
+ q0s32 = vmull_s16(d3s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+ q1s32 = vmull_s16(d4s16, d30s16);
+ q3s32 = vmull_s16(d5s16, d30s16);
+ q10s32 = vmull_s16(d4s16, d31s16);
+ q2s32 = vmull_s16(d5s16, d31s16);
+
+ q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+ q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+ *q8s16 = vaddq_s16(*q11s16, *q9s16);
+ *q11s16 = vsubq_s16(*q11s16, *q9s16);
+ q4s16 = vaddq_s16(*q12s16, *q14s16);
+ *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+ q14s32 = vaddq_s32(q5s32, q1s32);
+ q15s32 = vaddq_s32(q6s32, q3s32);
+ q5s32 = vsubq_s32(q5s32, q1s32);
+ q6s32 = vsubq_s32(q6s32, q3s32);
+
+ d18s16 = vqrshrn_n_s32(q14s32, 14);
+ d19s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q1s32 = vaddq_s32(q7s32, q10s32);
+ q3s32 = vaddq_s32(q0s32, q2s32);
+ q7s32 = vsubq_s32(q7s32, q10s32);
+ q0s32 = vsubq_s32(q0s32, q2s32);
+
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ d29s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q7s32, 14);
+ d15s16 = vqrshrn_n_s32(q0s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ q2s32 = vmull_s16(d22s16, d30s16);
+ q3s32 = vmull_s16(d23s16, d30s16);
+ q13s32 = vmull_s16(d22s16, d30s16);
+ q1s32 = vmull_s16(d23s16, d30s16);
+
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+ q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q3s32, 14);
+ d24s16 = vqrshrn_n_s32(q13s32, 14);
+ d25s16 = vqrshrn_n_s32(q1s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ q13s32 = vmull_s16(d10s16, d30s16);
+ q1s32 = vmull_s16(d11s16, d30s16);
+ q11s32 = vmull_s16(d10s16, d30s16);
+ q0s32 = vmull_s16(d11s16, d30s16);
+
+ q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+ q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+ d20s16 = vqrshrn_n_s32(q13s32, 14);
+ d21s16 = vqrshrn_n_s32(q1s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q0s32, 14);
+ *q10s16 = vcombine_s16(d20s16, d21s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q5s16 = vdupq_n_s16(0);
+
+ *q9s16 = vsubq_s16(q5s16, *q9s16);
+ *q11s16 = vsubq_s16(q5s16, q2s16);
+ *q13s16 = vsubq_s16(q5s16, q6s16);
+ *q15s16 = vsubq_s16(q5s16, q4s16);
+ return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i;
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 8 * 2);
+ q11s16 = vld1q_s16(input + 8 * 3);
+ q12s16 = vld1q_s16(input + 8 * 4);
+ q13s16 = vld1q_s16(input + 8 * 5);
+ q14s16 = vld1q_s16(input + 8 * 6);
+ q15s16 = vld1q_s16(input + 8 * 7);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;
+ case 1: // iadst_idct
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // first transform rows
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 2: // idct_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // then transform columns
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 3: // iadst_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ default: // invalid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ for (d1 = d2 = dest, i = 0; i < 2; i++) {
+ if (i != 0) {
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+ }
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ }
+ return;
+}
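
Note: the vp9_iht8x8_64_add_neon() function above runs a row transform, transposes, runs a column transform (IDCT or IADST depending on tx_type), then rounds by 5 bits and adds the result to the destination with saturation. The following is a minimal scalar sketch of that last reconstruction stage only; the function and variable names are illustrative and not part of the patch, and (out[i] + 16) >> 5 is the scalar equivalent of vrshrq_n_s16(x, 5).

  #include <stdint.h>

  /* Scalar sketch: round each transform output by 5 bits and add it to the
   * destination pixel, clamping to [0, 255] (the vqmovun step above). */
  static void add_transform_output_8x8(const int16_t *out, uint8_t *dest,
                                       int stride) {
    int r, c;
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) {
        const int residual = (out[r * 8 + c] + (1 << 4)) >> 5;
        const int v = dest[r * stride + c] + residual;
        dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }
  }
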
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 97fe02805..09f470e97 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,9 +8,178 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <arm_neon.h>
+
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+static INLINE void vp9_loop_filter_neon_16(
+ uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vdupq_n_u16(3);
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q4 = vdupq_n_u8(3);
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+ return;
+}
+
+#if !HAVE_NEON_ASM
+void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+ uint8x16_t qblimit, qlimit, qthresh;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+ dblimit0 = vld1_u8(blimit0);
+ dlimit0 = vld1_u8(limit0);
+ dthresh0 = vld1_u8(thresh0);
+ dblimit1 = vld1_u8(blimit1);
+ dlimit1 = vld1_u8(limit1);
+ dthresh1 = vld1_u8(thresh1);
+ qblimit = vcombine_u8(dblimit0, dblimit1);
+ qlimit = vcombine_u8(dlimit0, dlimit1);
+ qthresh = vcombine_u8(dthresh0, dthresh1);
+
+ s -= (p << 2);
+
+ q3u8 = vld1q_u8(s);
+ s += p;
+ q4u8 = vld1q_u8(s);
+ s += p;
+ q5u8 = vld1q_u8(s);
+ s += p;
+ q6u8 = vld1q_u8(s);
+ s += p;
+ q7u8 = vld1q_u8(s);
+ s += p;
+ q8u8 = vld1q_u8(s);
+ s += p;
+ q9u8 = vld1q_u8(s);
+ s += p;
+ q10u8 = vld1q_u8(s);
+
+ vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
+ q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+ &q5u8, &q6u8, &q7u8, &q8u8);
+
+ s -= (p * 5);
+ vst1q_u8(s, q5u8);
+ s += p;
+ vst1q_u8(s, q6u8);
+ s += p;
+ vst1q_u8(s, q7u8);
+ s += p;
+ vst1q_u8(s, q8u8);
+ return;
+}
+#endif // !HAVE_NEON_ASM
+
void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
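
Note: the new vp9_loop_filter_neon_16() above is the 16-pixel-wide vector form of the scalar 4-tap filter used elsewhere in the loopfilter code. The sketch below shows roughly what it computes per pixel, under the assumption that ps1/ps0/qs0/qs1 have already been converted to signed form (value ^ 0x80, converted back afterwards), that hev and mask are 0 or -1 per pixel, and with clamp8() standing in for signed saturation; names here are illustrative only.

  #include <stdint.h>

  static int8_t clamp8(int v) {  /* saturate to the int8_t range */
    return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
  }

  /* Scalar sketch of the 4-tap edge filter. */
  static void filter4_sketch(int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1,
                             int8_t hev, int8_t mask) {
    int8_t filter = clamp8(*ps1 - *qs1) & hev;           /* outer tap only on high edge variance */
    filter = clamp8(filter + 3 * (*qs0 - *ps0)) & mask;  /* inner taps, gated by the filter mask */
    {
      const int8_t filter1 = clamp8(filter + 4) >> 3;
      const int8_t filter2 = clamp8(filter + 3) >> 3;
      *qs0 = clamp8(*qs0 - filter1);
      *ps0 = clamp8(*ps0 + filter2);
      filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);    /* outer tap adjustment when not hev */
      *qs1 = clamp8(*qs1 - filter);
      *ps1 = clamp8(*ps1 + filter);
    }
  }
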
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c
index f54d7a94b..079d26677 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -10,7 +10,9 @@
#include <arm_neon.h>
-static inline void vp9_loop_filter_neon(
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
uint8x8_t dblimit, // flimit
uint8x8_t dlimit, // limit
uint8x8_t dthresh, // thresh
@@ -271,7 +273,7 @@ void vp9_lpf_vertical_4_neon(
return;
}
-static inline void vp9_mbloop_filter_neon(
+static INLINE void vp9_mbloop_filter_neon(
uint8x8_t dblimit, // mblimit
uint8x8_t dlimit, // limit
uint8x8_t dthresh, // thresh
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
new file mode 100644
index 000000000..d0beaa720
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_v_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
+
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += y_stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ return;
+}
+
+void vp9_v_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
+
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += y_stride)
+ vst1_u8(dst, d0u8);
+ return;
+}
+
+void vp9_v_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += y_stride)
+ vst1q_u8(dst, q0u8);
+ return;
+}
+
+void vp9_v_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += y_stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
+ return;
+}
+
+void vp9_h_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
+
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ return;
+}
+
+void vp9_h_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
+
+ d1u64 = vld1_u64((const uint64_t *)left);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+ return;
+}
+
+void vp9_h_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ }
+ return;
+}
+
+void vp9_h_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ for (k = 0; k < 2; k++, left += 16) {
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ }
+ }
+ return;
+}
+
+void vp9_tm_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+
+ d0u8 = vdup_n_u8(above[-1]);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += y_stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+ vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
+ return;
+}
+
+void vp9_tm_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
+
+ d0u8 = vdup_n_u8(above[-1]);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ }
+ return;
+}
+
+void vp9_tm_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
+ }
+ }
+ return;
+}
+
+void vp9_tm_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+ }
+ }
+ return;
+}
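
Note: every TM (TrueMotion) predictor above computes the same per-pixel expression, pred(r, c) = clip(left[r] + above[c] - above[-1]); the NEON code widens (above[c] - above[-1]) once, adds the broadcast left[r] per row and narrows with saturation. A scalar sketch of the 4x4 case follows; it is illustrative only and not part of the patch.

  #include <stdint.h>
  #include <stddef.h>

  /* Scalar sketch of the TrueMotion predictor for a 4x4 block. */
  static void tm_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left) {
    const int top_left = above[-1];
    int r, c;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        const int v = left[r] + above[c] - top_left;
        dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
      dst += stride;
    }
  }
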
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
index dc9856fa8..dc9856fa8 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index cb299f9f7..2f75af575 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -44,8 +44,10 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
}
+#if CONFIG_VP9_POSTPROC
vp9_free_frame_buffer(&cm->post_proc_buffer);
vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#endif
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
@@ -110,7 +112,8 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS) < 0)
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0)
goto fail;
if (cm->frame_bufs[i].mvs == NULL) {
cm->frame_bufs[i].mvs =
@@ -126,12 +129,13 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
init_frame_bufs(cm);
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
+#if CONFIG_VP9_POSTPROC
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS) < 0)
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0)
goto fail;
#endif
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7d7209c56..e7fb19fd1 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -212,6 +212,12 @@ typedef struct macroblockd {
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
+
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
@@ -221,17 +227,11 @@ typedef struct macroblockd {
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
#endif
- int lossless;
+ /* dqcoeff is shared by all the planes, so planes must be decoded serially. */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
+ int lossless;
int corrupted;
-
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
-
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
} MACROBLOCKD;
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index d7610ed28..4557e19bf 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -453,6 +453,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vp9_default_coef_probs(cm);
vp9_init_mode_probs(cm->fc);
vp9_init_mv_probs(cm);
+ cm->fc->initialized = 1;
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
@@ -469,8 +470,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
sizeof(*cm->prev_mip));
- vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
-
vp9_zero(cm->ref_frame_sign_bias);
cm->frame_context_idx = 0;
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 6831d3f87..6db10806d 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -50,6 +50,7 @@ typedef struct frame_contexts {
struct tx_probs tx_probs;
vp9_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
+ int initialized;
} FRAME_CONTEXT;
typedef struct {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 43a4fe5b9..58b2da75f 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -968,7 +968,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
break;
}
// The largest loopfilter we have is 16x16 so we use the 16x16 mask
- // for 32x32 transforms also also.
+ // for 32x32 transforms also.
lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index f1bdc1b06..92650e954 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -210,17 +210,85 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
return;
}
// No MFQE on blocks smaller than 16x16
- if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) {
+ if (bs == BLOCK_16X16) {
partition = PARTITION_NONE;
}
+ if (bs == BLOCK_64X64) {
+ mi_offset = 4;
+ y_offset = 32;
+ uv_offset = 16;
+ } else {
+ mi_offset = 2;
+ y_offset = 16;
+ uv_offset = 8;
+ }
switch (partition) {
+ BLOCK_SIZE mfqe_bs, bs_tmp;
case PARTITION_HORZ:
+ if (bs == BLOCK_64X64) {
+ mfqe_bs = BLOCK_64X32;
+ bs_tmp = BLOCK_32X32;
+ } else {
+ mfqe_bs = BLOCK_32X16;
+ bs_tmp = BLOCK_16X16;
+ }
+ if (mfqe_decision(mi, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+ y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+ vd + uv_offset, yd_stride, uvd_stride);
+ }
+ if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride);
+ }
+ break;
case PARTITION_VERT:
- // If current block size is not square.
- // Copy the block from current frame(i.e., no mfqe is done).
- // TODO(jackychen): Rectangle blocks should also be taken into account.
- copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
- yd_stride, uvd_stride, bs);
+ if (bs == BLOCK_64X64) {
+ mfqe_bs = BLOCK_32X64;
+ bs_tmp = BLOCK_32X32;
+ } else {
+ mfqe_bs = BLOCK_16X32;
+ bs_tmp = BLOCK_16X16;
+ }
+ if (mfqe_decision(mi, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ }
+ if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+ y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+ vd + uv_offset, yd_stride, uvd_stride);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride);
+ }
break;
case PARTITION_NONE:
if (mfqe_decision(mi, cur_bs)) {
@@ -234,15 +302,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
}
break;
case PARTITION_SPLIT:
- if (bs == BLOCK_64X64) {
- mi_offset = 4;
- y_offset = 32;
- uv_offset = 16;
- } else {
- mi_offset = 2;
- y_offset = 16;
- uv_offset = 8;
- }
// Recursion on four square partitions, e.g. if bs is 64X64,
// then look into four 32X32 blocks in it.
mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 55a1f86c7..ad91c10dd 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -112,8 +112,10 @@ typedef struct VP9Common {
int new_fb_idx;
+#if CONFIG_VP9_POSTPROC
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
FRAME_TYPE frame_type;
@@ -182,7 +184,6 @@ typedef struct VP9Common {
struct segmentation seg;
// Context probabilities for reference frame prediction
- int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
REFERENCE_MODE reference_mode;
@@ -207,6 +208,7 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
+ int byte_alignment;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -263,7 +265,7 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- xd->plane[i].dqcoeff = xd->dqcoeff[i];
+ xd->plane[i].dqcoeff = xd->dqcoeff;
xd->above_context[i] = cm->above_context +
i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
}
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index e1a389132..7eac70be2 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -671,7 +671,8 @@ int vp9_post_proc_frame(struct VP9Common *cm,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif // CONFIG_VP9_HIGHBITDEPTH
- VP9_ENC_BORDER_IN_PIXELS) < 0) {
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate MFQE framebuffer");
}
@@ -683,19 +684,18 @@ int vp9_post_proc_frame(struct VP9Common *cm,
}
}
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
+ VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL) < 0)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
-#endif
if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
- cm->postproc_state.last_frame_valid &&
+ cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
cm->postproc_state.last_base_qindex <= last_q_thresh &&
cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
vp9_mfqe(cm);
@@ -749,4 +749,4 @@ int vp9_post_proc_frame(struct VP9Common *cm,
swap_mi_and_prev_mi(cm);
return 0;
}
-#endif
+#endif // CONFIG_VP9_POSTPROC
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index dc15a84ff..c777bc81f 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -16,5 +16,7 @@ void vpx_scale_rtcd(void);
void vp9_rtcd() {
vpx_scale_rtcd();
+ // TODO(JBB): Remove this once, by ensuring that both the encoder and
+ // decoder setup functions are protected by once();
once(setup_rtcd_internal);
}
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 575990bb5..d2ab875e9 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -66,8 +66,7 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con
specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon;
+specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_4x4/;
@@ -79,12 +78,10 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon;
+specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc";
-$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon;
+specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
@@ -108,8 +105,7 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con
specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon;
+specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_8x8/;
@@ -121,12 +117,10 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon;
+specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc";
-$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon;
+specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
@@ -150,8 +144,7 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c
specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon;
+specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_16x16/;
@@ -163,12 +156,10 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon;
+specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon;
+specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
@@ -192,8 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c
specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc";
-$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon;
+specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_32x32/;
@@ -205,12 +195,10 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vp9_d153_predictor_32x32/;
add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon;
+specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64";
-$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon;
+specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
@@ -261,8 +249,7 @@ add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *
specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
+specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/;
#
# post proc
@@ -457,12 +444,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
- $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+ specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
- $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+ specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
@@ -1140,37 +1125,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error/;
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/;
- add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b/;
- add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/;
- add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant/;
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
- add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant sse2 ssse3/;
}
@@ -1865,16 +1850,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
- add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp/;
- add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp_32x32/;
- add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b sse2/;
- add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index c4efa6565..71dbb402d 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -312,9 +312,11 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+ __m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -333,27 +335,26 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
// duplicate only the forth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+ srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
+ srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
+ srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
+ srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
+ srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
+ srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+
for (i = 0; i < output_height; i++) {
- // load the first 8 bytes
- srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
- srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
// merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
- srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
- srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
// merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -377,6 +378,15 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
@@ -390,9 +400,11 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -411,19 +423,24 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
// duplicate only the forth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 16 bytes
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+
for (i = 0; i < output_height; i++) {
- // load the first 16 bytes
- srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
- // load the next 16 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
- srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+ // load the last 16 bytes
+ srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
// merge the result together
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
- srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
@@ -435,25 +452,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- // load the next 16 bytes in stride of two/three src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
// merge the result together
- srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
- // load the next 16 bytes in stride of four/five src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
// merge the result together
- srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
@@ -461,13 +470,13 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_min_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_min_epi16(srcRegFilt6, srcRegFilt8));
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_max_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_max_epi16(srcRegFilt6, srcRegFilt8));
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
@@ -484,6 +493,15 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save 16 bytes convolve result
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
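The vertical-filter rewrite above keeps the seven previously loaded source rows live across loop iterations, loads only one new row per output line, and then rotates the register window. A minimal scalar sketch of that row-reuse pattern for a single output column (illustrative only; the function name and plain-C loop are assumptions, not the SSSE3 intrinsics in the patch):

#include <stdint.h>

/* Scalar sketch (single column) of the row-reuse pattern: keep seven rows
 * from the previous iteration and load only the newest one each time. */
static void filter_col_8tap_sketch(const unsigned char *src, int src_pitch,
                                   unsigned char *dst, int dst_pitch,
                                   int height, const int16_t filter[8]) {
  int rows[8];
  int r, k;
  for (k = 0; k < 7; ++k)                 /* prime the 7-row window */
    rows[k] = src[k * src_pitch];
  for (r = 0; r < height; ++r) {
    int sum = 64;                         /* rounding, like addFilterReg64 */
    rows[7] = src[(r + 7) * src_pitch];   /* load only the last row */
    for (k = 0; k < 8; ++k)
      sum += rows[k] * filter[k];
    sum >>= 7;                            /* filters are 7-bit scaled */
    dst[r * dst_pitch] =
        (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    for (k = 0; k < 7; ++k)               /* shift the window down a row */
      rows[k] = rows[k + 1];
  }
}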
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 58df87d0c..9677173db 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -719,6 +719,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
+ cm->byte_alignment,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -793,6 +794,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
+ cm->byte_alignment,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -1556,6 +1558,10 @@ void vp9_decode_frame(VP9Decoder *pbi,
vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
*cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ if (!cm->fc->initialized)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
vp9_zero(cm->counts);
xd->corrupted = 0;
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 2daf86200..1406b4034 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -15,6 +15,7 @@
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
@@ -33,8 +34,8 @@
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_dthread.h"
-static void initialize_dec() {
- static int init_done = 0;
+static void initialize_dec(void) {
+ static volatile int init_done = 0;
if (!init_done) {
vp9_rtcd();
@@ -85,7 +86,7 @@ VP9Decoder *vp9_decoder_create() {
sizeof(*cm->frame_contexts)));
pbi->need_resync = 1;
- initialize_dec();
+ once(initialize_dec);
// Initialize the references to not point to any frame buffers.
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c
index 3eef72844..c3b38a9c7 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,20 +10,20 @@
#include "vp9/decoder/vp9_read_bit_buffer.h"
size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
- return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
+ return (rb->bit_offset + 7) >> 3;
}
int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
const size_t off = rb->bit_offset;
- const size_t p = off / CHAR_BIT;
- const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
- if (rb->bit_buffer + p >= rb->bit_buffer_end) {
- rb->error_handler(rb->error_handler_data);
- return 0;
- } else {
- const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+ const size_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
rb->bit_offset = off + 1;
return bit;
+ } else {
+ rb->error_handler(rb->error_handler_data);
+ return 0;
}
}
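The rewritten helpers above replace the CHAR_BIT arithmetic with equivalent shift/mask forms and test the in-bounds case first. As a worked example of the indexing (assuming 8-bit bytes, which the shift form hard-codes): for bit_offset = 13, p = 13 >> 3 = 1 and q = 7 - (13 & 7) = 2, so the read returns (bit_buffer[1] >> 2) & 1, i.e. bit 5 counting from the MSB of the second byte, and vp9_rb_bytes_read reports (13 + 7) >> 3 = 2 bytes consumed.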
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 8c13d0da6..9cf1e5e2c 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,13 +26,12 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)scan;
if (!skip_block) {
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 019d5b90b..752429c8f 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1181,7 +1181,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
counts->intra_inter[i]);
- if (cm->allow_comp_inter_inter) {
+ if (cpi->allow_comp_inter_inter) {
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2ffc7ea67..68174a6cc 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -40,8 +40,6 @@ struct macroblock_plane {
int16_t *round;
int64_t quant_thred[2];
- // Zbin Over Quant value
- int16_t zbin_extra;
};
/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 020a95196..506f6de84 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -339,7 +339,7 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int eob = -1;
@@ -416,7 +416,6 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 4deeed217..56ec6b335 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -425,6 +425,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#endif
int border) {
int i, fail;
+ const int legacy_byte_alignment = 0;
assert(denoiser != NULL);
for (i = 0; i < MAX_REF_FRAMES; ++i) {
@@ -433,7 +434,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border);
+ border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -448,7 +449,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border);
+ border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 756393f31..4c948237d 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -423,8 +423,9 @@ static int set_vt_partitioning(VP9_COMP *cpi,
if (cm->frame_type == KEY_FRAME) {
bsize_ref = BLOCK_8X8;
- // Choose lower thresholds for key frame variance to favor split.
- threshold_bsize_ref = threshold >> 1;
+    // Choose lower thresholds for key frame variance to favor split, but keep
+    // the threshold for splitting to 4x4 blocks fairly high for now.
+ threshold_bsize_ref = threshold << 2;
threshold_low = threshold >> 2;
}
@@ -592,7 +593,16 @@ static void choose_partitioning(VP9_COMP *cpi,
unsigned int sse = 0;
int sum = 0;
if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ int s_avg;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ } else {
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ }
+#else
int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+#endif
// For key frame, reference is set to 128.
sum = s_avg - 128;
sse = sum * sum;
@@ -646,8 +656,6 @@ static void choose_partitioning(VP9_COMP *cpi,
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
- // TODO(marpan): Allow for setting 4x4 partition on key frame.
- /*
if (cm->frame_type == KEY_FRAME) {
if (!set_vt_partitioning(cpi, xd,
&vt.split[i].split[j].split[k],
@@ -660,12 +668,11 @@ static void choose_partitioning(VP9_COMP *cpi,
BLOCK_4X4);
}
} else {
- */
set_block_size(cpi, xd,
(mi_row + y32_idx + y16_idx + y8_idx),
(mi_col + x32_idx + x16_idx + x8_idx),
BLOCK_8X8);
- // }
+ }
}
}
}
@@ -3612,7 +3619,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
if (xd->lossless) {
x->optimize = 0;
cm->lf.filter_level = 0;
- cpi->zbin_mode_boost_enabled = 0;
}
vp9_frame_init_quantizer(cpi);
@@ -3716,9 +3722,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
(cm->ref_frame_sign_bias[ALTREF_FRAME] ==
cm->ref_frame_sign_bias[LAST_FRAME])) {
- cm->allow_comp_inter_inter = 0;
+ cpi->allow_comp_inter_inter = 0;
} else {
- cm->allow_comp_inter_inter = 1;
+ cpi->allow_comp_inter_inter = 1;
cm->comp_fixed_ref = ALTREF_FRAME;
cm->comp_var_ref[0] = LAST_FRAME;
cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3742,7 +3748,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
const int is_alt_ref = frame_type == ALTREF_FRAME;
/* prediction (compound, single or hybrid) mode selection */
- if (is_alt_ref || !cm->allow_comp_inter_inter)
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
cm->reference_mode = SINGLE_REFERENCE;
else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
mode_thrs[COMPOUND_REFERENCE] >
@@ -3852,24 +3858,6 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
++counts->uv_mode[y_mode][uv_mode];
}
-static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
- if (enabled) {
- if (is_inter_block(mbmi)) {
- if (mbmi->mode == ZEROMV) {
- return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST
- : LF_ZEROMV_ZBIN_BOOST;
- } else {
- return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST
- : MV_ZBIN_BOOST;
- }
- } else {
- return INTRA_ZBIN_BOOST;
- }
- } else {
- return 0;
- }
-}
-
static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -3905,12 +3893,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
- // Experimental code. Special case for gf and arf zeromv modes.
- // Increase zbin size to suppress noise
- cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi,
- cpi->zbin_mode_boost_enabled);
- vp9_update_zbin_extra(cpi, x);
-
if (!is_inter_block(mbmi)) {
int plane;
mbmi->skip = 1;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 9b2165be6..9c29eb438 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -149,7 +149,6 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
int64_t rd_cost0, rd_cost1;
int rate0, rate1, error0, error1, t0, t1;
int best, band, pt, i, final_eob;
- const TOKENVALUE *dct_value_tokens;
const int16_t *dct_value_cost;
assert((!type && !plane) || (type && plane));
@@ -169,22 +168,18 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->bd == 12) {
- dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
dct_value_cost = vp9_dct_value_cost_high12_ptr;
} else if (xd->bd == 10) {
- dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
dct_value_cost = vp9_dct_value_cost_high10_ptr;
} else {
- dct_value_tokens = vp9_dct_value_tokens_ptr;
dct_value_cost = vp9_dct_value_cost_ptr;
}
#else
- dct_value_tokens = vp9_dct_value_tokens_ptr;
dct_value_cost = vp9_dct_value_cost_ptr;
#endif
for (i = 0; i < eob; i++)
token_cache[scan[i]] =
- vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
+ vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
for (i = eob; i-- > 0;) {
int base_bits, d2, dx;
@@ -198,7 +193,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
/* Evaluate the first possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
- t0 = (dct_value_tokens + x)->token;
+ t0 = vp9_get_token(x);
/* Consider both possible successor states. */
if (next < default_eob) {
band = band_translate[i + 1];
@@ -250,7 +245,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
} else {
- t0 = t1 = (dct_value_tokens + x)->token;
+ t0 = t1 = vp9_get_token(x);
}
if (next < default_eob) {
band = band_translate[i + 1];
@@ -391,28 +386,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round_fp, p->quant_fp, p->quant_shift,
qcoeff, dqcoeff, pd->dequant,
- p->zbin_extra, eob, scan_order->scan,
+ eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -427,28 +422,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -561,28 +556,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, p->zbin_extra, eob,
+ dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_16X16:
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -597,28 +592,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -849,8 +844,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant,
- p->zbin_extra, eob,
+ qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -871,7 +865,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -893,7 +887,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -919,7 +913,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
@@ -958,7 +952,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -978,7 +972,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -998,7 +992,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -1022,7 +1016,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 620027e2d..047b9aaef 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -181,8 +181,8 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
}
-void vp9_initialize_enc() {
- static int init_done = 0;
+void vp9_initialize_enc(void) {
+ static volatile int init_done = 0;
if (!init_done) {
vp9_rtcd();
@@ -490,7 +490,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
}
@@ -510,7 +511,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -520,7 +522,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
@@ -530,7 +533,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
}
@@ -566,7 +570,8 @@ static void update_frame_size(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
@@ -2472,7 +2477,8 @@ void vp9_scale_references(VP9_COMP *cpi) {
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
cm->use_highbitdepth,
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
(int)cm->bit_depth);
#else
@@ -2481,7 +2487,8 @@ void vp9_scale_references(VP9_COMP *cpi) {
vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
@@ -2720,7 +2727,8 @@ void set_frame_size(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
alloc_util_frame_buffers(cpi);
init_motion_estimation(cpi);
@@ -3139,12 +3147,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_clear_system_state();
- // Enable or disable mode based tweaking of the zbin.
- // For 2 pass only used where GF/ARF prediction quality
- // is above a threshold.
- cpi->zbin_mode_boost = 0;
- cpi->zbin_mode_boost_enabled = 0;
-
// Set the arf sign bias for this frame.
set_arf_sign_bias(cpi);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 14f7c7f0c..7872e2cc1 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -315,9 +315,6 @@ typedef struct VP9_COMP {
int *nmvsadcosts[2];
int *nmvsadcosts_hp[2];
- int zbin_mode_boost;
- int zbin_mode_boost_enabled;
-
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
@@ -339,6 +336,8 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
+ int allow_comp_inter_inter;
+
// Default value is 1. From first pass stats, encode_breakout may be disabled.
ENCODE_BREAKOUT_TYPE allow_encode_breakout;
@@ -449,7 +448,7 @@ typedef struct VP9_COMP {
VP9Worker *workers;
} VP9_COMP;
-void vp9_initialize_enc();
+void vp9_initialize_enc(void);
struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
void vp9_remove_compressor(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 823e7a162..708072ee2 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -65,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
// Allocate the lookahead structures
ctx = calloc(1, sizeof(*ctx));
if (ctx) {
+ const int legacy_byte_alignment = 0;
unsigned int i;
ctx->max_sz = depth;
ctx->buf = calloc(depth, sizeof(*ctx->buf));
@@ -76,7 +77,8 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS))
+ VP9_ENC_BORDER_IN_PIXELS,
+ legacy_byte_alignment))
goto bail;
}
return ctx;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index b45032456..319a47833 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -249,14 +249,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
dc_quant >> (xd->bd - 5), &rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
dc_quant >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
dc_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -265,14 +265,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
ac_quant >> (xd->bd - 5), &rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
ac_quant >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
ac_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -447,11 +447,10 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
args->dist += dist;
}
-static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
{THR_DC, THR_H_PRED, THR_V_PRED, THR_TM},
{THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
{THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
- {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
};
static const PREDICTION_MODE intra_mode_list[] = {
@@ -522,8 +521,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
struct macroblockd_plane *const pd = &xd->plane[0];
PREDICTION_MODE best_mode = ZEROMV;
MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
- TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cm->tx_mode]);
+ TX_SIZE best_tx_size = TX_SIZES;
INTERP_FILTER best_pred_filter = EIGHTTAP;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -537,9 +535,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Reduce the intra cost penalty for small blocks (<=16x16).
const int reduction_fac =
(cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
- bsize <= BLOCK_16X16) ? 4 : 1;
+ bsize <= BLOCK_16X16) ? 2 : 0;
const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
const int8_t segment_id = mbmi->segment_id;
@@ -839,13 +837,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
- if (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
- bsize <= cpi->sf.max_intra_bsize) {
+ if (best_rdc.rdcost == INT64_MAX ||
+ (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+ bsize <= cpi->sf.max_intra_bsize)) {
struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
const TX_SIZE intra_tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
int i;
+ TX_SIZE best_intra_tx_size = TX_SIZES;
if (reuse_inter_pred && best_pred != NULL) {
if (best_pred->data == orig_dst.buf) {
@@ -870,11 +870,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
pd->dst = orig_dst;
for (i = 0; i < 4; ++i) {
- const TX_SIZE saved_tx_size = mbmi->tx_size;
const PREDICTION_MODE this_mode = intra_mode_list[i];
if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
continue;
- skip_txfm = x->skip_txfm[0];
args.mode = this_mode;
args.rate = 0;
args.dist = 0;
@@ -891,15 +889,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (this_rdc.rdcost < best_rdc.rdcost) {
best_rdc = this_rdc;
mbmi->mode = this_mode;
- mbmi->tx_size = intra_tx_size;
+ best_intra_tx_size = mbmi->tx_size;
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
- } else {
- x->skip_txfm[0] = best_mode_skip_txfm;
- mbmi->tx_size = saved_tx_size;
}
}
+
+ // Reset mb_mode_info to the best inter mode.
+ if (mbmi->ref_frame[0] != INTRA_FRAME) {
+ x->skip_txfm[0] = best_mode_skip_txfm;
+ mbmi->tx_size = best_tx_size;
+ } else {
+ mbmi->tx_size = best_intra_tx_size;
+ }
}
pd->dst = orig_dst;
@@ -923,14 +926,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (is_inter_block(mbmi))
- vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
- cpi->sf.adaptive_rd_thresh, bsize,
- mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)]);
- else
- vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
- cpi->sf.adaptive_rd_thresh, bsize,
- mode_idx[INTRA_FRAME][mbmi->mode]);
+ if (cpi->sf.adaptive_rd_thresh) {
+ THR_MODES best_mode_idx = is_inter_block(mbmi) ?
+ mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)] :
+ mode_idx[INTRA_FRAME][mbmi->mode];
+ PREDICTION_MODE this_mode;
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+ int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
+ if (thr_mode_idx == best_mode_idx)
+ *freq_fact -= (*freq_fact >> 4);
+ else
+ *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+ }
*rd_cost = best_rdc;
}
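The new threshold update above runs only when adaptive_rd_thresh is enabled and adjusts every LAST/GOLDEN inter-mode factor each time, not just the winner's. A minimal sketch of the per-factor rule it applies (illustrative helper name; RD_THRESH_INC, RD_THRESH_MAX_FACT, and MIN are assumed to come from the existing encoder headers):

/* Sketch: the chosen mode's factor decays by 1/16 of itself; every other
 * tested mode's factor drifts up toward adaptive_rd_thresh * RD_THRESH_MAX_FACT,
 * making rarely chosen modes progressively easier to skip. */
static void update_thresh_freq_fact_sketch(int *freq_fact, int is_best_mode,
                                           int adaptive_rd_thresh) {
  if (is_best_mode) {
    *freq_fact -= (*freq_fact >> 4);
  } else {
    const int cap = adaptive_rd_thresh * RD_THRESH_MAX_FACT;
    *freq_fact = MIN(*freq_fact + RD_THRESH_INC, cap);
  }
}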
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index e7a20c4d2..389dc87e0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -122,14 +122,13 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -168,7 +167,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
@@ -178,7 +176,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
@@ -217,12 +214,11 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -261,12 +257,11 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -302,13 +297,11 @@ void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
- zbin_ptr[1] + zbin_oq_value };
- const int nzbins[2] = { zbins[0] * -1,
- zbins[1] * -1 };
+ const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -355,14 +348,12 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, int zbin_oq_value,
+ const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
- zbin_ptr[1] + zbin_oq_value };
- const int nzbins[2] = { zbins[0] * -1,
- zbins[1] * -1 };
+ const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -412,10 +403,10 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
+ const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
@@ -471,11 +462,11 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
- const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
int idx_arr[1024];
@@ -534,7 +525,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
p->zbin, p->round, p->quant, p->quant_shift,
BLOCK_OFFSET(p->qcoeff, block),
BLOCK_OFFSET(pd->dqcoeff, block),
- pd->dequant, p->zbin_extra, &p->eobs[block],
+ pd->dequant, &p->eobs[block],
scan, iscan);
return;
}
@@ -544,7 +535,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
p->zbin, p->round, p->quant, p->quant_shift,
BLOCK_OFFSET(p->qcoeff, block),
BLOCK_OFFSET(pd->dqcoeff, block),
- pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
+ pd->dequant, &p->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -641,7 +632,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
const int segment_id = xd->mi[0].src_mi->mbmi.segment_id;
const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
- const int zbin = cpi->zbin_mode_boost;
int i;
// Y
@@ -651,13 +641,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
- x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
xd->plane[0].dequant = cm->y_dequant[qindex];
- x->plane[0].quant_thred[0] = (x->plane[0].zbin[0] + x->plane[0].zbin_extra) *
- (x->plane[0].zbin[0] + x->plane[0].zbin_extra);
- x->plane[0].quant_thred[1] = (x->plane[0].zbin[1] + x->plane[0].zbin_extra) *
- (x->plane[0].zbin[1] + x->plane[0].zbin_extra);
+ x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+ x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
// UV
for (i = 1; i < 3; i++) {
@@ -667,15 +654,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
- x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
xd->plane[i].dequant = cm->uv_dequant[qindex];
- x->plane[i].quant_thred[0] =
- (x->plane[i].zbin[0] + x->plane[i].zbin_extra) *
- (x->plane[i].zbin[0] + x->plane[i].zbin_extra);
- x->plane[i].quant_thred[1] =
- (x->plane[i].zbin[1] + x->plane[i].zbin_extra) *
- (x->plane[i].zbin[1] + x->plane[i].zbin_extra);
+ x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+ x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
}
x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
@@ -687,20 +669,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
vp9_initialize_me_consts(cpi, x->q_index);
}
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
- const int qindex = x->q_index;
- const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
- cpi->zbin_mode_boost) >> 7;
- const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
- cpi->zbin_mode_boost) >> 7;
-
- x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
- x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
- x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
-}
-
void vp9_frame_init_quantizer(VP9_COMP *cpi) {
- cpi->zbin_mode_boost = 0;
vp9_init_plane_quantizers(cpi, &cpi->td.mb);
}
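With the zbin_oq_value term gone, the quantizer dead zone comes straight from the zbin tables: a coefficient quantizes to zero when it lies inside (-zbins[i], zbins[i]), the 32x32 variants halve those bounds with ROUND_POWER_OF_TWO (e.g. assumed zbin values {24, 21} give zbins {12, 11}), and vp9_init_plane_quantizers now squares the zbin values directly to form quant_thred.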
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index cee46e7e0..de2839f5b 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -68,8 +68,6 @@ struct VP9Common;
void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
void vp9_init_quantizer(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 5b49bfc17..34d49f058 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -379,7 +379,7 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
unsigned int qstep, int *rate,
int64_t *dist) {
// This function models the rate and distortion for a Laplacian
@@ -395,10 +395,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
int d_q10, r_q10;
static const uint32_t MAX_XSQ_Q10 = 245727;
const uint64_t xsq_q10_64 =
- ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
model_rd_norm(xsq_q10, &r_q10, &d_q10);
- *rate = (n * r_q10 + 2) >> 2;
+ *rate = ((r_q10 << n_log2) + 2) >> 2;
*dist = (var * (int64_t)d_q10 + 512) >> 10;
}
}
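The new n_log2 argument replaces the pixel count n = 1 << n_log2, so both the Q10 ratio and the rate scaling can be formed with shifts: ((uint64_t)qstep * qstep * n) << 10 equals (uint64_t)qstep * qstep << (n_log2 + 10), and n * r_q10 equals r_q10 << n_log2. As a check with assumed values, a 16x16 block covers 256 pixels so n_log2 is 8; with qstep = 20 the numerator is 400 << 18 = 104857600 under both forms.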
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 600a3eb1a..ded082f86 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -37,6 +37,7 @@
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
(1 << INTRA_FRAME))
@@ -48,6 +49,7 @@
#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
#define MIN_EARLY_TERM_INDEX 3
+#define NEW_MV_DISCOUNT_FACTOR 8
typedef struct {
PREDICTION_MODE mode;
@@ -75,6 +77,7 @@ struct rdcost_block_args {
const scan_order *so;
};
+#define LAST_NEW_MV_INDEX 6
static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
{NEARESTMV, {LAST_FRAME, NONE}},
{NEARESTMV, {ALTREF_FRAME, NONE}},
@@ -265,15 +268,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
} else {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> (xd->bd - 5),
&rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
rate_sum += rate;
@@ -370,7 +373,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
// dc token
int v = qcoeff[0];
- int prev_t = vp9_dct_value_tokens_ptr[v].token;
+ int prev_t = vp9_get_token(v);
cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
token_cache[0] = vp9_pt_energy_class[prev_t];
++token_costs;
@@ -381,7 +384,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
int t;
v = qcoeff[rc];
- t = vp9_dct_value_tokens_ptr[v].token;
+ t = vp9_get_token(v);
if (use_fast_coef_costing) {
cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
} else {
@@ -2355,6 +2358,27 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
}
}
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field, and especially where there is
+// low spatial complexity, it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established, that vector may be usable through the nearest
+// and near mv modes to reduce distortion in subsequent blocks and also
+// improve visual quality.
+static int discount_newmv_test(const VP9_COMP *cpi,
+ int this_mode,
+ int_mv this_mv,
+ int_mv (*mode_mv)[MAX_REF_FRAMES],
+ int ref_frame) {
+ return (!cpi->rc.is_src_frame_alt_ref &&
+ (this_mode == NEWMV) &&
+ (this_mv.as_int != 0) &&
+ ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+ ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+}
+
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
@@ -2464,10 +2488,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&tmp_mv, &rate_mv);
if (tmp_mv.as_int == INVALID_MV)
return INT64_MAX;
- *rate2 += rate_mv;
+
frame_mv[refs[0]].as_int =
xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+ // Estimate the rate implications of a new mv but discount this
+ // under certain circumstances where we want to help initiate a weak
+ // motion field, where the distortion gain for a single block may not
+ // be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+ *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ } else {
+ *rate2 += rate_mv;
+ }
}
}
@@ -2492,11 +2526,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
orig_dst_stride[i] = xd->plane[i].dst.stride;
}
- /* We don't include the cost of the second reference here, because there
- * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
- * words if you present them in that order, the second one is always known
- * if the first is known */
- *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ // We don't include the cost of the second reference here, because there
+ // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+ // words if you present them in that order, the second one is always known
+ // if the first is known.
+ //
+ // Under some circumstances we discount the cost of new mv mode to encourage
+ // initiation of a motion field.
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+ mode_mv, refs[0])) {
+ *rate2 += MIN(cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]),
+ cost_mv_ref(cpi, NEARESTMV, mbmi->mode_context[refs[0]]));
+ } else {
+ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ }
if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
mbmi->mode != NEARESTMV)
@@ -2726,6 +2769,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
x->skip_encode = 0;
ctx->skip = 0;
xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0].src_mi->mbmi.ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -2941,7 +2985,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
mode_skip_mask[INTRA_FRAME] |=
~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
- for (i = 0; i < MAX_MODES; ++i)
+ for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+ mode_threshold[i] = 0;
+ for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
midx = sf->schedule_mode_search ? mode_skip_start : 0;
@@ -3065,7 +3111,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
// Skip compound inter modes if ARF is not available.
@@ -3715,7 +3761,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
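To make the new-mv discount above concrete (rate value chosen for illustration): with NEW_MV_DISCOUNT_FACTOR defined as 8, a NEWMV candidate whose motion-vector rate would normally be rate_mv = 120 is charged only MAX(120 / 8, 1) = 15 when discount_newmv_test passes, and its mode-signalling cost is capped at the cheaper of the NEWMV and NEARESTMV costs for that context; candidates that fail the test pay the full rate as before.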
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 0775b919c..15831fbbe 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -339,18 +339,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->mv.fullpel_search_step_param = 10;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
}
-
- if (speed >= 12) {
+ if (speed >= 8) {
sf->adaptive_rd_thresh = 4;
sf->mv.subpel_force_stop = 2;
}
-
- if (speed >= 13) {
- int i;
- sf->max_intra_bsize = BLOCK_32X32;
- for (i = 0; i < BLOCK_SIZES; ++i)
- sf->inter_mode_mask[i] = INTER_NEAREST;
- }
}
void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 184322f4f..31e93be65 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -39,7 +39,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cpi->common.use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS,
+ cpi->common.byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate empty frame for multiple frame "
"contexts");
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index a4051f05e..424cc0843 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -710,8 +710,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
- NULL)) {
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment,
+ NULL, NULL, NULL)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 393eb1a4a..06bcfc317 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -23,23 +23,46 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_tokenize.h"
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr;
+const int16_t *vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
#if CONFIG_VP9_HIGHBITDEPTH
-static TOKENVALUE dct_value_tokens_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const int16_t *vp9_dct_value_cost_high10_ptr;
+const int16_t *vp9_dct_value_cost_high10_ptr =
+ dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-static TOKENVALUE dct_value_tokens_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const int16_t *vp9_dct_value_cost_high12_ptr;
+const int16_t *vp9_dct_value_cost_high12_ptr =
+ dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
#endif
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+ {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+ {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+ {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+ {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+ {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+ {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+ {8, 7}, {8, 5}, {8, 3}, {8, 1},
+ {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+ {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+ {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},
+ {1, 0}, {2, 0}, {3, 0}, {4, 0},
+ {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+ {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+ {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+ {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+ {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+ {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+ {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+ {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+ {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+ {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+ (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+ / 2;
+
// Array indices are identical to previously-existing CONTEXT_NODE indices
const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-EOB_TOKEN, 2, // 0 = EOB
@@ -67,21 +90,31 @@ const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
};
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+static const vp9_tree_index cat1[2] = {0, 0};
+static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+ 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
#if CONFIG_VP9_HIGHBITDEPTH
-static vp9_tree_index cat1_high10[2];
-static vp9_tree_index cat2_high10[4];
-static vp9_tree_index cat3_high10[6];
-static vp9_tree_index cat4_high10[8];
-static vp9_tree_index cat5_high10[10];
-static vp9_tree_index cat6_high10[32];
-static vp9_tree_index cat1_high12[2];
-static vp9_tree_index cat2_high12[4];
-static vp9_tree_index cat3_high12[6];
-static vp9_tree_index cat4_high12[8];
-static vp9_tree_index cat5_high12[10];
-static vp9_tree_index cat6_high12[36];
+static const vp9_tree_index cat1_high10[2] = {0, 0};
+static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 0, 0};
+static const vp9_tree_index cat1_high12[2] = {0, 0};
+static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 32, 32, 34, 34, 0, 0};
#endif
static void init_bit_tree(vp9_tree_index *p, int n) {
@@ -95,28 +128,6 @@ static void init_bit_tree(vp9_tree_index *p, int n) {
p[0] = p[1] = 0;
}
-static void init_bit_trees() {
- init_bit_tree(cat1, 1);
- init_bit_tree(cat2, 2);
- init_bit_tree(cat3, 3);
- init_bit_tree(cat4, 4);
- init_bit_tree(cat5, 5);
- init_bit_tree(cat6, 14);
-#if CONFIG_VP9_HIGHBITDEPTH
- init_bit_tree(cat1_high10, 1);
- init_bit_tree(cat2_high10, 2);
- init_bit_tree(cat3_high10, 3);
- init_bit_tree(cat4_high10, 4);
- init_bit_tree(cat5_high10, 5);
- init_bit_tree(cat6_high10, 16);
- init_bit_tree(cat1_high12, 1);
- init_bit_tree(cat2_high12, 2);
- init_bit_tree(cat3_high12, 3);
- init_bit_tree(cat4_high12, 4);
- init_bit_tree(cat5_high12, 5);
- init_bit_tree(cat6_high12, 18);
-#endif
-}
const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
{0, 0, 0, 0}, // ZERO_TOKEN
@@ -167,43 +178,24 @@ const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
void vp9_coef_tree_initialize() {
- init_bit_trees();
vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
}
-static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e,
+static void tokenize_init_one(const vp9_extra_bit *const e,
int16_t *value_cost, int max_value) {
int i = -max_value;
- int sign = 1;
+ TOKENVALUE t;
do {
- if (!i)
- sign = 0;
-
- {
- const int a = sign ? -i : i;
- int eb = sign;
-
- if (a > 4) {
- int j = 4;
-
- while (++j < 11 && e[j].base_val <= a) {}
-
- t[i].token = --j;
- eb |= (a - e[j].base_val) << 1;
- } else {
- t[i].token = a;
- }
- t[i].extra = eb;
- }
+ vp9_get_token_extra(i, &t.token, &t.extra);
// initialize the cost for extra bits for all possible coefficient value.
{
int cost = 0;
- const vp9_extra_bit *p = &e[t[i].token];
+ const vp9_extra_bit *p = &e[t.token];
if (p->base_val) {
- const int extra = t[i].extra;
+ const int extra = t.extra;
const int length = p->len;
if (length)
@@ -217,26 +209,14 @@ static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e,
}
void vp9_tokenize_initialize() {
- vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
- vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
-
- tokenize_init_one(dct_value_tokens + DCT_MAX_VALUE, vp9_extra_bits,
+ tokenize_init_one(vp9_extra_bits,
dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE);
#if CONFIG_VP9_HIGHBITDEPTH
- vp9_dct_value_tokens_high10_ptr = dct_value_tokens_high10 +
- DCT_MAX_VALUE_HIGH10;
- vp9_dct_value_cost_high10_ptr = dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-
- tokenize_init_one(dct_value_tokens_high10 + DCT_MAX_VALUE_HIGH10,
- vp9_extra_bits_high10,
+ tokenize_init_one(vp9_extra_bits_high10,
dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10,
DCT_MAX_VALUE_HIGH10);
- vp9_dct_value_tokens_high12_ptr = dct_value_tokens_high12 +
- DCT_MAX_VALUE_HIGH12;
- vp9_dct_value_cost_high12_ptr = dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
- tokenize_init_one(dct_value_tokens_high12 + DCT_MAX_VALUE_HIGH12,
- vp9_extra_bits_high12,
+ tokenize_init_one(vp9_extra_bits_high12,
dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12,
DCT_MAX_VALUE_HIGH12);
#endif
@@ -322,8 +302,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
td->counts->eob_branch[tx_size][type][ref];
const uint8_t *const band = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
- const TOKENVALUE *dct_value_tokens;
-
+ int16_t token;
+ EXTRABIT extra;
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -333,17 +313,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
scan = so->scan;
nb = so->neighbors;
c = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cpi->common.profile >= PROFILE_2) {
- dct_value_tokens = (cpi->common.bit_depth == VPX_BITS_10 ?
- vp9_dct_value_tokens_high10_ptr :
- vp9_dct_value_tokens_high12_ptr);
- } else {
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- }
-#else
- dct_value_tokens = vp9_dct_value_tokens_ptr;
-#endif
while (c < eob) {
int v = 0;
@@ -362,14 +331,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
v = qcoeff[scan[c]];
}
- add_token(&t, coef_probs[band[c]][pt],
- dct_value_tokens[v].extra,
- (uint8_t)dct_value_tokens[v].token,
- (uint8_t)skip_eob,
- counts[band[c]][pt]);
+ vp9_get_token_extra(v, &token, &extra);
+
+ add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+ (uint8_t)skip_eob, counts[band[c]][pt]);
eob_branch[band[c]][pt] += !skip_eob;
- token_cache[scan[c]] = vp9_pt_energy_class[dct_value_tokens[v].token];
+ token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
pt = get_coef_context(nb, token_cache, c);
}
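
Note on the table introduced above: dct_cat_lt_10_value_tokens is exported through a pointer moved to the middle of the array so it can be indexed directly with signed coefficient values. A minimal standalone sketch of that centering trick, using a five-entry toy table that mirrors the middle slice of the real one (names are illustrative, not part of the library):

#include <stdio.h>
#include <stdint.h>

typedef struct { int16_t token; int16_t extra; } TOYVALUE;  /* stand-in for TOKENVALUE */

/* Covers v = -2 .. 2; the real table covers every magnitude below CAT6_MIN_VAL. */
static const TOYVALUE toy_tokens[5] = {
  {2, 1}, {1, 1}, {0, 0}, {1, 0}, {2, 0}
};
/* Offset the base pointer to the element for v == 0, exactly as the commit does. */
static const TOYVALUE *toy_tokens_mid =
    toy_tokens + sizeof(toy_tokens) / sizeof(*toy_tokens) / 2;

int main(void) {
  int v;
  for (v = -2; v <= 2; ++v)  /* negative and positive indices both stay in bounds */
    printf("v=%2d token=%d extra=%d\n", v, toy_tokens_mid[v].token, toy_tokens_mid[v].extra);
  return 0;
}
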
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 00afb723e..845e139f2 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -24,24 +24,23 @@ void vp9_tokenize_initialize();
#define EOSB_TOKEN 127 // Not signalled, encoder only
-typedef struct {
- int16_t token;
#if CONFIG_VP9_HIGHBITDEPTH
- int32_t extra;
+ typedef int32_t EXTRABIT;
#else
- int16_t extra;
+ typedef int16_t EXTRABIT;
#endif
+
+
+typedef struct {
+ int16_t token;
+ EXTRABIT extra;
} TOKENVALUE;
typedef struct {
const vp9_prob *context_tree;
-#if CONFIG_VP9_HIGHBITDEPTH
- int32_t extra;
-#else
- int16_t extra;
-#endif
- uint8_t token;
- uint8_t skip_eob_node;
+ EXTRABIT extra;
+ uint8_t token;
+ uint8_t skip_eob_node;
} TOKENEXTRA;
extern const vp9_tree_index vp9_coef_tree[];
@@ -63,6 +62,7 @@ extern const int16_t *vp9_dct_value_cost_ptr;
* fields are not.
*/
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
#if CONFIG_VP9_HIGHBITDEPTH
extern const int16_t *vp9_dct_value_cost_high10_ptr;
extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
@@ -70,6 +70,25 @@ extern const int16_t *vp9_dct_value_cost_high12_ptr;
extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
#endif // CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ *token = CATEGORY6_TOKEN;
+ if (v >= CAT6_MIN_VAL)
+ *extra = 2 * v - 2 * CAT6_MIN_VAL;
+ else
+ *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+ return;
+ }
+ *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+ *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t vp9_get_token(int v) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+ return 10;
+ return vp9_dct_cat_lt_10_value_tokens[v].token;
+}
+
+
#ifdef __cplusplus
} // extern "C"
#endif
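
For magnitudes at or above the category-6 threshold, vp9_get_token_extra() above computes the extra bits arithmetically instead of reading a table: the offset from the category base is shifted left by one and the sign lands in the least-significant bit. A hedged standalone sketch of that formula, assuming CAT6_MIN_VAL is 67 (the CATEGORY6 base value):

#include <stdio.h>
#include <stdint.h>

#define TOY_CAT6_MIN_VAL 67  /* assumed value of CAT6_MIN_VAL */

/* Mirrors the CAT6 branch: (magnitude - base) << 1, with the sign in bit 0. */
static int32_t toy_cat6_extra(int v) {
  if (v >= TOY_CAT6_MIN_VAL)
    return 2 * v - 2 * TOY_CAT6_MIN_VAL;        /* positive value: LSB = 0 */
  return -2 * v - 2 * TOY_CAT6_MIN_VAL + 1;     /* negative value: LSB = 1 */
}

int main(void) {
  /* prints 0 1 6 7 */
  printf("%d %d %d %d\n",
         toy_cat6_extra(67), toy_cat6_extra(-67),
         toy_cat6_extra(70), toy_cat6_extra(-70));
  return 0;
}
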
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index e671f3998..ae22a0b32 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -254,7 +254,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- int zbin_oq_value, uint16_t* eob_ptr,
+ uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
@@ -287,7 +287,6 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)coeff_ptr;
// Pre-condition input (shift by two)
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index 237c5e278..5c0ad7892 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -23,7 +23,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
const int16_t* quant_shift_ptr,
int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- int zbin_oq_value, uint16_t* eob_ptr,
+ uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
@@ -57,7 +57,6 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)coeff_ptr;
// Pre-condition input (shift by two)
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
index 55c6ed71f..0bce9c321 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -24,7 +24,6 @@ void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
@@ -32,11 +31,11 @@ void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
__m128i zbins[2];
__m128i nzbins[2];
- zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
- (int)(zbin_ptr[1] + zbin_oq_value),
- (int)(zbin_ptr[1] + zbin_oq_value),
- (int)(zbin_ptr[0] + zbin_oq_value));
- zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
+ (int)zbin_ptr[1],
+ (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
nzbins[0] = _mm_setzero_si128();
nzbins[1] = _mm_setzero_si128();
@@ -111,7 +110,6 @@ void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
@@ -120,14 +118,14 @@ void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
- const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
- const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
- zbins[0] = _mm_set_epi32((zbin1_tmp + zbin_oq_value),
- (zbin1_tmp + zbin_oq_value),
- (zbin1_tmp + zbin_oq_value),
- (zbin0_tmp + zbin_oq_value));
- zbins[1] = _mm_set1_epi32((zbin1_tmp + zbin_oq_value));
+ zbins[0] = _mm_set_epi32(zbin1_tmp,
+ zbin1_tmp,
+ zbin1_tmp,
+ zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
nzbins[0] = _mm_setzero_si128();
nzbins[1] = _mm_setzero_si128();
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index e06eb2f15..679c66e30 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -18,7 +18,7 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- int zbin_oq_value, uint16_t* eob_ptr,
+ uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
@@ -39,13 +39,10 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
// Setup global values
{
- __m128i zbin_oq;
__m128i pw_1;
- zbin_oq = _mm_set1_epi16(zbin_oq_value);
zbin = _mm_load_si128((const __m128i*)zbin_ptr);
round = _mm_load_si128((const __m128i*)round_ptr);
quant = _mm_load_si128((const __m128i*)quant_ptr);
- zbin = _mm_add_epi16(zbin, zbin_oq);
pw_1 = _mm_set1_epi16(1);
zbin = _mm_sub_epi16(zbin, pw_1);
dequant = _mm_load_si128((const __m128i*)dequant_ptr);
@@ -229,14 +226,13 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- int zbin_oq_value, uint16_t* eob_ptr,
+ uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
coeff_ptr += n_coeffs;
iscan_ptr += n_coeffs;
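
In vp9_quantize_b_sse2 above, the zero-bin threshold now comes straight from zbin_ptr with no zbin_oq offset added. A rough scalar sketch of the dead-zone test the SSE2 code vectorizes (function and parameter names here are illustrative, not the library's):

#include <stdint.h>

/* Coefficients whose magnitude falls below the zero-bin threshold are forced
 * to zero before rounding and quantization; zbin_ptr[0] applies to the DC
 * coefficient, zbin_ptr[1] to all AC coefficients. */
static int below_dead_zone(int16_t coeff, const int16_t *zbin_ptr, int is_dc) {
  const int abs_coeff = coeff < 0 ? -coeff : coeff;
  return abs_coeff < zbin_ptr[is_dc ? 0 : 1];
}
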
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index f5f05e799..72e01d646 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -17,7 +17,7 @@ SECTION .text
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -29,13 +29,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
- movd m4, dword zbin_oqm ; m4 = zbin_oq
mova m0, [zbinq] ; m0 = zbin
- punpcklwd m4, m4
mova m1, [roundq] ; m1 = round
- pshufd m4, m4, 0
mova m2, [quantq] ; m2 = quant
- paddw m0, m4 ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
pcmpeqw m5, m5
psrlw m5, 15
@@ -55,7 +51,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psllw m4, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -220,7 +216,7 @@ QUANTIZE_FN b_32x32, 7
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -248,7 +244,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psllw m2, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2504f4db9..f5e6e3190 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -133,13 +133,11 @@ ifeq ($(ARCH_X86_64), yes)
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
@@ -158,8 +156,10 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c
@@ -178,6 +178,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b9fb8140c..7b4b17809 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_codec.h"
+#include "vpx_ports/vpx_once.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vpx_version.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -208,7 +209,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
"or kf_max_dist instead.");
RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
- RANGE_CHECK(extra_cfg, cpu_used, -16, 16);
+ RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
@@ -729,7 +730,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
}
priv->extra_cfg = default_extra_cfg;
- vp9_initialize_enc();
+ once(vp9_initialize_enc);
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
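
vp9_initialize_enc() is now routed through once() from vpx_ports/vpx_once.h, so repeated encoder instantiations run the global table setup a single time. A minimal, single-threaded sketch of that idea (the real header relies on platform primitives such as pthread_once; this toy version is not thread-safe):

#include <stdio.h>

static void run_once(void (*func)(void)) {
  static int done;        /* illustration only; no locking */
  if (!done) {
    done = 1;
    func();
  }
}

static void init_tables(void) { printf("tables initialized\n"); }

int main(void) {
  run_once(init_tables);  /* runs the initializer */
  run_once(init_tables);  /* second call is a no-op */
  return 0;
}
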
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 809514001..43bf35f9c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -44,6 +44,7 @@ struct vpx_codec_alg_priv {
int flushed;
int invert_tile_order;
int frame_parallel_decode; // frame-based threading.
+ int byte_alignment;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
@@ -219,6 +220,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
VP9_COMMON *const cm = &ctx->pbi->common;
cm->new_fb_idx = -1;
+ cm->byte_alignment = ctx->byte_alignment;
if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
cm->get_fb_cb = ctx->get_ext_fb_cb;
@@ -617,6 +619,27 @@ static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int legacy_byte_alignment = 0;
+ const int min_byte_alignment = 32;
+ const int max_byte_alignment = 1024;
+ const int byte_alignment = va_arg(args, int);
+
+ if (byte_alignment != legacy_byte_alignment &&
+ (byte_alignment < min_byte_alignment ||
+ byte_alignment > max_byte_alignment ||
+ (byte_alignment & (byte_alignment - 1)) != 0))
+ return VPX_CODEC_INVALID_PARAM;
+
+ ctx->byte_alignment = byte_alignment;
+ if (ctx->pbi != NULL) {
+ VP9_COMMON *const cm = &ctx->pbi->common;
+ cm->byte_alignment = byte_alignment;
+ }
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8_COPY_REFERENCE, ctrl_copy_reference},
@@ -629,6 +652,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options},
{VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order},
{VPXD_SET_DECRYPTOR, ctrl_set_decryptor},
+ {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment},
// Getters
{VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates},
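
An application selects the new alignment through the decoder control interface; 0 keeps the legacy layout, anything else must be a power of two between 32 and 1024, as validated in ctrl_set_byte_alignment above. A hedged usage sketch, assuming VP9_SET_BYTE_ALIGNMENT is exposed through vpx/vp8dx.h alongside the other VP9 decoder controls:

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

/* Request 64-byte-aligned frame buffers on an already-initialized decoder. */
static vpx_codec_err_t set_frame_alignment(vpx_codec_ctx_t *decoder) {
  return vpx_codec_control(decoder, VP9_SET_BYTE_ALIGNMENT, 64);
}
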