diff options
Diffstat (limited to 'vp9')
57 files changed, 2121 insertions, 1521 deletions
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c index 2f8dda07c..dd569d348 100644 --- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c +++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c @@ -11,6 +11,9 @@ #include <stddef.h> #include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/mem.h" + void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -22,7 +25,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -static inline int32x4_t MULTIPLY_BY_Q0( +static INLINE int32x4_t MULTIPLY_BY_Q0( int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c index c8704aa9c..5c555c458 100644 --- a/vp9/common/arm/neon/vp9_convolve8_neon.c +++ b/vp9/common/arm/neon/vp9_convolve8_neon.c @@ -11,6 +11,9 @@ #include <stddef.h> #include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/mem.h" + void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -22,7 +25,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -static inline int32x4_t MULTIPLY_BY_Q0( +static INLINE int32x4_t MULTIPLY_BY_Q0( int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm deleted file mode 100644 index 60a0d98c5..000000000 --- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp9_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr, -; uint8_t *dst_ptr, int pitch, int stride) -; -; r0 int input_dc -; r1 uint8_t *pred_ptr -; r2 uint8_t *dst_ptr -; r3 int pitch -; sp int stride - -|vp9_dc_only_idct_add_neon| PROC - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; dct_const_round_shift(input_dc * cospi_16_64) - mul r0, r0, r12 ; input_dc * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; ROUND_POWER_OF_TWO(out, 4) - add r0, r0, #8 ; + (1 <<((4) - 1)) - asr r0, r0, #4 ; >> 4 - - vdup.16 q0, r0; ; duplicate a1 - ldr r12, [sp] ; load stride - - vld1.32 {d2[0]}, [r1], r3 - vld1.32 {d2[1]}, [r1], r3 - vld1.32 {d4[0]}, [r1], r3 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c] - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 ; clip_pixel - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r2], r12 - vst1.32 {d2[1]}, [r2], r12 - vst1.32 {d4[0]}, [r2], r12 - vst1.32 {d4[1]}, [r2] - - bx lr - ENDP ; |vp9_dc_only_idct_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c index 68d7cccc0..5fa3f5c01 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vpx_config.h" + static int16_t cospi_2_64 = 16305; static int16_t cospi_4_64 = 16069; static int16_t cospi_6_64 = 15679; @@ -26,7 +28,7 @@ static int16_t cospi_26_64 = 4756; static int16_t cospi_28_64 = 3196; static int16_t cospi_30_64 = 1606; -static inline void TRANSPOSE8X8( +static INLINE void TRANSPOSE8X8( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c index 1bfee22b2..d0e4b4f40 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c @@ -11,7 +11,9 @@ #include <arm_neon.h> #include "vp9/common/vp9_idct.h" -static inline void LD_16x8( +#include "./vpx_config.h" + +static INLINE void LD_16x8( uint8_t *d, int d_stride, uint8x16_t *q8u8, @@ -40,7 +42,7 @@ static inline void LD_16x8( return; } -static inline void ADD_DIFF_16x8( +static INLINE void ADD_DIFF_16x8( uint8x16_t qdiffu8, uint8x16_t *q8u8, uint8x16_t *q9u8, @@ -61,7 +63,7 @@ static inline void ADD_DIFF_16x8( return; } -static inline void SUB_DIFF_16x8( +static INLINE void SUB_DIFF_16x8( uint8x16_t qdiffu8, uint8x16_t *q8u8, uint8x16_t *q9u8, @@ -82,7 +84,7 @@ static inline void SUB_DIFF_16x8( return; } -static inline void ST_16x8( +static INLINE void ST_16x8( uint8_t *d, int d_stride, uint8x16_t *q8u8, diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c index 53f721b44..309bdf8d7 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vpx_config.h" + static int16_t cospi_1_64 = 16364; static int16_t cospi_2_64 = 16305; static int16_t cospi_3_64 = 16207; @@ -57,7 +59,7 @@ static int16_t cospi_31_64 = 804; #define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ q6s16, q7s16, q8s16, q9s16); -static inline void __STORE_COMBINE_CENTER_RESULTS( +static INLINE void __STORE_COMBINE_CENTER_RESULTS( uint8_t *p1, uint8_t *p2, int stride, @@ -105,7 +107,7 @@ static inline void __STORE_COMBINE_CENTER_RESULTS( #define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ q4s16, q5s16, q6s16, q7s16); -static inline void __STORE_COMBINE_EXTREME_RESULTS( +static INLINE void __STORE_COMBINE_EXTREME_RESULTS( uint8_t *p1, uint8_t *p2, int stride, @@ -152,7 +154,7 @@ static inline void __STORE_COMBINE_EXTREME_RESULTS( #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static inline void DO_BUTTERFLY( +static INLINE void DO_BUTTERFLY( int16x8_t q14s16, int16x8_t q13s16, int16_t first_const, @@ -194,7 +196,7 @@ static inline void DO_BUTTERFLY( return; } -static inline void idct32_transpose_pair( +static INLINE void idct32_transpose_pair( int16_t *input, int16_t *t_buf) { int16_t *in; @@ -288,7 +290,7 @@ static inline void idct32_transpose_pair( return; } -static inline void idct32_bands_end_1st_pass( +static INLINE void idct32_bands_end_1st_pass( int16_t *out, int16x8_t q2s16, int16x8_t q3s16, @@ -383,7 +385,7 @@ static inline void idct32_bands_end_1st_pass( return; } -static inline void idct32_bands_end_2nd_pass( +static INLINE void idct32_bands_end_2nd_pass( int16_t *out, uint8_t *dest, int stride, diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c index 50587f6bc..2b3c1ce60 100644 --- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vpx_config.h" + static int16_t cospi_4_64 = 16069; static int16_t cospi_8_64 = 15137; static int16_t cospi_12_64 = 13623; @@ -18,7 +20,7 @@ static int16_t cospi_20_64 = 9102; static int16_t cospi_24_64 = 6270; static int16_t cospi_28_64 = 3196; -static inline void TRANSPOSE8X8( +static INLINE void TRANSPOSE8X8( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, @@ -87,7 +89,7 @@ static inline void TRANSPOSE8X8( return; } -static inline void IDCT8x8_1D( +static INLINE void IDCT8x8_1D( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm deleted file mode 100644 index 2f326e24c..000000000 --- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm +++ /dev/null @@ -1,237 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_iht4x4_16_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are - ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain - ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back - ; into d16-d19 registers. This macro will touch q10- q15 registers and use - ; them as buffer during calculation. - MACRO - IDCT4x4_1D - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64 - vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64 - vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64 - vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q10, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - vswp d18, d19 - MEND - - ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which - ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9. - ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be - ; stored back into d16-d19 registers. This macro will touch q11,q12,q13, - ; q14,q15 registers and use them as buffer during calculation. - MACRO - IADST4x4_1D - vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0 - vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0 - vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1 - vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2 - vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2 - vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit - vaddw.s16 q15, q15, d19 ; x0 + x3 - vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3 - vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2 - vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3 - - vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5 - vadd.s32 q10, q10, q8 - vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6 - vdup.32 q8, r0 ; duplicate sinpi_3_9 - vsub.s32 q11, q11, q9 - vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7 - - vadd.s32 q13, q10, q12 ; s0 = x0 + x3 - vadd.s32 q10, q10, q11 ; x0 + x1 - vadd.s32 q14, q11, q12 ; s1 = x1 + x3 - vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3 - - ; dct_const_round_shift - vqrshrn.s32 d16, q13, #14 - vqrshrn.s32 d17, q14, #14 - vqrshrn.s32 d18, q15, #14 - vqrshrn.s32 d19, q10, #14 - MEND - - ; Generate cosine constants in d6 - d8 for the IDCT - MACRO - GENERATE_COSINE_CONSTANTS - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x187e - mov r12, #0x1800 - add r12, #0x7e - - ; generate constant vectors - vdup.16 d0, r0 ; duplicate cospi_8_64 - vdup.16 d1, r3 ; duplicate cospi_16_64 - vdup.16 d2, r12 ; duplicate cospi_24_64 - MEND - - ; Generate sine constants in d1 - d4 for the IADST. - MACRO - GENERATE_SINE_CONSTANTS - ; sinpi_1_9 = 5283 = 0x14A3 - mov r0, #0x1400 - add r0, #0xa3 - ; sinpi_2_9 = 9929 = 0x26C9 - mov r3, #0x2600 - add r3, #0xc9 - ; sinpi_4_9 = 15212 = 0x3B6C - mov r12, #0x3b00 - add r12, #0x6c - - ; generate constant vectors - vdup.16 d3, r0 ; duplicate sinpi_1_9 - - ; sinpi_3_9 = 13377 = 0x3441 - mov r0, #0x3400 - add r0, #0x41 - - vdup.16 d4, r3 ; duplicate sinpi_2_9 - vdup.16 d5, r12 ; duplicate sinpi_4_9 - vdup.16 q3, r0 ; duplicate sinpi_3_9 - MEND - - ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19. - MACRO - TRANSPOSE4X4 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - vtrn.32 q8, q9 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride, int tx_type) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride -; r3 int tx_type) -; This function will only handle tx_type of 1,2,3. -|vp9_iht4x4_16_add_neon| PROC - - ; load the inputs into d16-d19 - vld1.s16 {q8,q9}, [r0]! - - ; transpose the input data - TRANSPOSE4X4 - - ; decide the type of transform - cmp r3, #2 - beq idct_iadst - cmp r3, #3 - beq iadst_iadst - -iadst_idct - ; generate constants - GENERATE_COSINE_CONSTANTS - GENERATE_SINE_CONSTANTS - - ; first transform rows - IDCT4x4_1D - - ; transpose the matrix - TRANSPOSE4X4 - - ; then transform columns - IADST4x4_1D - - b end_vp9_iht4x4_16_add_neon - -idct_iadst - ; generate constants - GENERATE_COSINE_CONSTANTS - GENERATE_SINE_CONSTANTS - - ; first transform rows - IADST4x4_1D - - ; transpose the matrix - TRANSPOSE4X4 - - ; then transform columns - IDCT4x4_1D - - b end_vp9_iht4x4_16_add_neon - -iadst_iadst - ; generate constants - GENERATE_SINE_CONSTANTS - - ; first transform rows - IADST4x4_1D - - ; transpose the matrix - TRANSPOSE4X4 - - ; then transform columns - IADST4x4_1D - -end_vp9_iht4x4_16_add_neon - ; ROUND_POWER_OF_TWO(temp_out[j], 4) - vrshr.s16 q8, q8, #4 - vrshr.s16 q9, q9, #4 - - vld1.32 {d26[0]}, [r1], r2 - vld1.32 {d26[1]}, [r1], r2 - vld1.32 {d27[0]}, [r1], r2 - vld1.32 {d27[1]}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d26 - vaddw.u8 q9, q9, d27 - - ; clip_pixel - vqmovun.s16 d26, q8 - vqmovun.s16 d27, q9 - - ; do the stores in reverse order with negative post-increment, by changing - ; the sign of the stride - rsb r2, r2, #0 - vst1.32 {d27[1]}, [r1], r2 - vst1.32 {d27[0]}, [r1], r2 - vst1.32 {d26[1]}, [r1], r2 - vst1.32 {d26[0]}, [r1] ; no post-increment - bx lr - ENDP ; |vp9_iht4x4_16_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c new file mode 100644 index 000000000..1761fada2 --- /dev/null +++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +static int16_t sinpi_1_9 = 0x14a3; +static int16_t sinpi_2_9 = 0x26c9; +static int16_t sinpi_3_9 = 0x3441; +static int16_t sinpi_4_9 = 0x3b6c; +static int16_t cospi_8_64 = 0x3b21; +static int16_t cospi_16_64 = 0x2d41; +static int16_t cospi_24_64 = 0x187e; + +static INLINE void TRANSPOSE4X4( + int16x8_t *q8s16, + int16x8_t *q9s16) { + int32x4_t q8s32, q9s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + + d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); + d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); + + q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); + q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); + q0x2s32 = vtrnq_s32(q8s32, q9s32); + + *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); + *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); + return; +} + +static INLINE void GENERATE_COSINE_CONSTANTS( + int16x4_t *d0s16, + int16x4_t *d1s16, + int16x4_t *d2s16) { + *d0s16 = vdup_n_s16(cospi_8_64); + *d1s16 = vdup_n_s16(cospi_16_64); + *d2s16 = vdup_n_s16(cospi_24_64); + return; +} + +static INLINE void GENERATE_SINE_CONSTANTS( + int16x4_t *d3s16, + int16x4_t *d4s16, + int16x4_t *d5s16, + int16x8_t *q3s16) { + *d3s16 = vdup_n_s16(sinpi_1_9); + *d4s16 = vdup_n_s16(sinpi_2_9); + *q3s16 = vdupq_n_s16(sinpi_3_9); + *d5s16 = vdup_n_s16(sinpi_4_9); + return; +} + +static INLINE void IDCT4x4_1D( + int16x4_t *d0s16, + int16x4_t *d1s16, + int16x4_t *d2s16, + int16x8_t *q8s16, + int16x8_t *q9s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + int32x4_t q10s32, q13s32, q14s32, q15s32; + int16x8_t q13s16, q14s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, *d2s16); + q10s32 = vmull_s16(d17s16, *d0s16); + q13s32 = vmull_s16(d23s16, *d1s16); + q14s32 = vmull_s16(d24s16, *d1s16); + q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); + q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q10s32, 14); + + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + *q8s16 = vaddq_s16(q13s16, q14s16); + *q9s16 = vsubq_s16(q13s16, q14s16); + *q9s16 = vcombine_s16(vget_high_s16(*q9s16), + vget_low_s16(*q9s16)); // vswp + return; +} + +static INLINE void IADST4x4_1D( + int16x4_t *d3s16, + int16x4_t *d4s16, + int16x4_t *d5s16, + int16x8_t *q3s16, + int16x8_t *q8s16, + int16x8_t *q9s16) { + int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; + + d6s16 = vget_low_s16(*q3s16); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + + q10s32 = vmull_s16(*d3s16, d16s16); + q11s32 = vmull_s16(*d4s16, d16s16); + q12s32 = vmull_s16(d6s16, d17s16); + q13s32 = vmull_s16(*d5s16, d18s16); + q14s32 = vmull_s16(*d3s16, d18s16); + q15s32 = vmovl_s16(d16s16); + q15s32 = vaddw_s16(q15s32, d19s16); + q8s32 = vmull_s16(*d4s16, d19s16); + q15s32 = vsubw_s16(q15s32, d18s16); + q9s32 = vmull_s16(*d5s16, d19s16); + + q10s32 = vaddq_s32(q10s32, q13s32); + q10s32 = vaddq_s32(q10s32, q8s32); + q11s32 = vsubq_s32(q11s32, q14s32); + q8s32 = vdupq_n_s32(sinpi_3_9); + q11s32 = vsubq_s32(q11s32, q9s32); + q15s32 = vmulq_s32(q15s32, q8s32); + + q13s32 = vaddq_s32(q10s32, q12s32); + q10s32 = vaddq_s32(q10s32, q11s32); + q14s32 = vaddq_s32(q11s32, q12s32); + q10s32 = vsubq_s32(q10s32, q12s32); + + d16s16 = vqrshrn_n_s32(q13s32, 14); + d17s16 = vqrshrn_n_s32(q14s32, 14); + d18s16 = vqrshrn_n_s32(q15s32, 14); + d19s16 = vqrshrn_n_s32(q10s32, 14); + + *q8s16 = vcombine_s16(d16s16, d17s16); + *q9s16 = vcombine_s16(d18s16, d19s16); + return; +} + +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + uint8x8_t d26u8, d27u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; + uint32x2_t d26u32, d27u32; + int16x8_t q3s16, q8s16, q9s16; + uint16x8_t q8u16, q9u16; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + TRANSPOSE4X4(&q8s16, &q9s16); + + switch (tx_type) { + case 0: // idct_idct is not supported. Fall back to C + vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + return; + break; + case 1: // iadst_idct + // generate constants + GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + break; + case 2: // idct_iadst + // generate constantsyy + GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + break; + case 3: // iadst_iadst + // generate constants + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + break; + default: // iadst_idct + assert(0); + break; + } + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); + dest += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); + dest += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); + dest += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + return; +} diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm deleted file mode 100644 index b41f5661b..000000000 --- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm +++ /dev/null @@ -1,698 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_iht8x8_64_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Generate IADST constants in r0 - r12 for the IADST. - MACRO - GENERATE_IADST_CONSTANTS - ; generate cospi_2_64 = 16305 - mov r0, #0x3f00 - add r0, #0xb1 - - ; generate cospi_30_64 = 1606 - mov r1, #0x600 - add r1, #0x46 - - ; generate cospi_10_64 = 14449 - mov r2, #0x3800 - add r2, #0x71 - - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_18_64 = 10394 - mov r4, #0x2800 - add r4, #0x9a - - ; generate cospi_14_64 = 12665 - mov r5, #0x3100 - add r5, #0x79 - - ; generate cospi_26_64 = 4756 - mov r6, #0x1200 - add r6, #0x94 - - ; generate cospi_6_64 = 15679 - mov r7, #0x3d00 - add r7, #0x3f - - ; generate cospi_8_64 = 15137 - mov r8, #0x3b00 - add r8, #0x21 - - ; generate cospi_24_64 = 6270 - mov r9, #0x1800 - add r9, #0x7e - - ; generate 0 - mov r10, #0 - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - MEND - - ; Generate IDCT constants in r3 - r9 for the IDCT. - MACRO - GENERATE_IDCT_CONSTANTS - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - MEND - - ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are - ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output - ; will be stored back into q8-q15 registers. This macro will touch q0-q7 - ; registers and use them as buffer during calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which - ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The - ; output will be stored back into q8-q15 registers. This macro will touch - ; q0 - q7 registers and use them as buffer during calculation. - MACRO - IADST8X8_1D - vdup.16 d14, r0 ; duplicate cospi_2_64 - vdup.16 d15, r1 ; duplicate cospi_30_64 - - ; cospi_2_64 * x0 - vmull.s16 q1, d30, d14 - vmull.s16 q2, d31, d14 - - ; cospi_30_64 * x0 - vmull.s16 q3, d30, d15 - vmull.s16 q4, d31, d15 - - vdup.16 d30, r4 ; duplicate cospi_18_64 - vdup.16 d31, r5 ; duplicate cospi_14_64 - - ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - vmlal.s16 q1, d16, d15 - vmlal.s16 q2, d17, d15 - - ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1 - vmlsl.s16 q3, d16, d14 - vmlsl.s16 q4, d17, d14 - - ; cospi_18_64 * x4 - vmull.s16 q5, d22, d30 - vmull.s16 q6, d23, d30 - - ; cospi_14_64 * x4 - vmull.s16 q7, d22, d31 - vmull.s16 q8, d23, d31 - - ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - vmlal.s16 q5, d24, d31 - vmlal.s16 q6, d25, d31 - - ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5 - vmlsl.s16 q7, d24, d30 - vmlsl.s16 q8, d25, d30 - - ; (s0 + s4) - vadd.s32 q11, q1, q5 - vadd.s32 q12, q2, q6 - - vdup.16 d0, r2 ; duplicate cospi_10_64 - vdup.16 d1, r3 ; duplicate cospi_22_64 - - ; (s0 - s4) - vsub.s32 q1, q1, q5 - vsub.s32 q2, q2, q6 - - ; x0 = dct_const_round_shift(s0 + s4); - vqrshrn.s32 d22, q11, #14 ; >> 14 - vqrshrn.s32 d23, q12, #14 ; >> 14 - - ; (s1 + s5) - vadd.s32 q12, q3, q7 - vadd.s32 q15, q4, q8 - - ; (s1 - s5) - vsub.s32 q3, q3, q7 - vsub.s32 q4, q4, q8 - - ; x4 = dct_const_round_shift(s0 - s4); - vqrshrn.s32 d2, q1, #14 ; >> 14 - vqrshrn.s32 d3, q2, #14 ; >> 14 - - ; x1 = dct_const_round_shift(s1 + s5); - vqrshrn.s32 d24, q12, #14 ; >> 14 - vqrshrn.s32 d25, q15, #14 ; >> 14 - - ; x5 = dct_const_round_shift(s1 - s5); - vqrshrn.s32 d6, q3, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; cospi_10_64 * x2 - vmull.s16 q4, d26, d0 - vmull.s16 q5, d27, d0 - - ; cospi_22_64 * x2 - vmull.s16 q2, d26, d1 - vmull.s16 q6, d27, d1 - - vdup.16 d30, r6 ; duplicate cospi_26_64 - vdup.16 d31, r7 ; duplicate cospi_6_64 - - ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - vmlal.s16 q4, d20, d1 - vmlal.s16 q5, d21, d1 - - ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - vmlsl.s16 q2, d20, d0 - vmlsl.s16 q6, d21, d0 - - ; cospi_26_64 * x6 - vmull.s16 q0, d18, d30 - vmull.s16 q13, d19, d30 - - ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - vmlal.s16 q0, d28, d31 - vmlal.s16 q13, d29, d31 - - ; cospi_6_64 * x6 - vmull.s16 q10, d18, d31 - vmull.s16 q9, d19, d31 - - ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - vmlsl.s16 q10, d28, d30 - vmlsl.s16 q9, d29, d30 - - ; (s3 + s7) - vadd.s32 q14, q2, q10 - vadd.s32 q15, q6, q9 - - ; (s3 - s7) - vsub.s32 q2, q2, q10 - vsub.s32 q6, q6, q9 - - ; x3 = dct_const_round_shift(s3 + s7); - vqrshrn.s32 d28, q14, #14 ; >> 14 - vqrshrn.s32 d29, q15, #14 ; >> 14 - - ; x7 = dct_const_round_shift(s3 - s7); - vqrshrn.s32 d4, q2, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; (s2 + s6) - vadd.s32 q9, q4, q0 - vadd.s32 q10, q5, q13 - - ; (s2 - s6) - vsub.s32 q4, q4, q0 - vsub.s32 q5, q5, q13 - - vdup.16 d30, r8 ; duplicate cospi_8_64 - vdup.16 d31, r9 ; duplicate cospi_24_64 - - ; x2 = dct_const_round_shift(s2 + s6); - vqrshrn.s32 d18, q9, #14 ; >> 14 - vqrshrn.s32 d19, q10, #14 ; >> 14 - - ; x6 = dct_const_round_shift(s2 - s6); - vqrshrn.s32 d8, q4, #14 ; >> 14 - vqrshrn.s32 d9, q5, #14 ; >> 14 - - ; cospi_8_64 * x4 - vmull.s16 q5, d2, d30 - vmull.s16 q6, d3, d30 - - ; cospi_24_64 * x4 - vmull.s16 q7, d2, d31 - vmull.s16 q0, d3, d31 - - ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - vmlal.s16 q5, d6, d31 - vmlal.s16 q6, d7, d31 - - ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - vmlsl.s16 q7, d6, d30 - vmlsl.s16 q0, d7, d30 - - ; cospi_8_64 * x7 - vmull.s16 q1, d4, d30 - vmull.s16 q3, d5, d30 - - ; cospi_24_64 * x7 - vmull.s16 q10, d4, d31 - vmull.s16 q2, d5, d31 - - ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - vmlsl.s16 q1, d8, d31 - vmlsl.s16 q3, d9, d31 - - ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - vmlal.s16 q10, d8, d30 - vmlal.s16 q2, d9, d30 - - vadd.s16 q8, q11, q9 ; x0 = s0 + s2; - - vsub.s16 q11, q11, q9 ; x2 = s0 - s2; - - vadd.s16 q4, q12, q14 ; x1 = s1 + s3; - - vsub.s16 q12, q12, q14 ; x3 = s1 - s3; - - ; (s4 + s6) - vadd.s32 q14, q5, q1 - vadd.s32 q15, q6, q3 - - ; (s4 - s6) - vsub.s32 q5, q5, q1 - vsub.s32 q6, q6, q3 - - ; x4 = dct_const_round_shift(s4 + s6); - vqrshrn.s32 d18, q14, #14 ; >> 14 - vqrshrn.s32 d19, q15, #14 ; >> 14 - - ; x6 = dct_const_round_shift(s4 - s6); - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; (s5 + s7) - vadd.s32 q1, q7, q10 - vadd.s32 q3, q0, q2 - - ; (s5 - s7)) - vsub.s32 q7, q7, q10 - vsub.s32 q0, q0, q2 - - ; x5 = dct_const_round_shift(s5 + s7); - vqrshrn.s32 d28, q1, #14 ; >> 14 - vqrshrn.s32 d29, q3, #14 ; >> 14 - - ; x7 = dct_const_round_shift(s5 - s7); - vqrshrn.s32 d14, q7, #14 ; >> 14 - vqrshrn.s32 d15, q0, #14 ; >> 14 - - vdup.16 d30, r12 ; duplicate cospi_16_64 - - ; cospi_16_64 * x2 - vmull.s16 q2, d22, d30 - vmull.s16 q3, d23, d30 - - ; cospi_6_64 * x6 - vmull.s16 q13, d22, d30 - vmull.s16 q1, d23, d30 - - ; cospi_16_64 * x2 + cospi_16_64 * x3; - vmlal.s16 q2, d24, d30 - vmlal.s16 q3, d25, d30 - - ; cospi_16_64 * x2 - cospi_16_64 * x3; - vmlsl.s16 q13, d24, d30 - vmlsl.s16 q1, d25, d30 - - ; x2 = dct_const_round_shift(s2); - vqrshrn.s32 d4, q2, #14 ; >> 14 - vqrshrn.s32 d5, q3, #14 ; >> 14 - - ;x3 = dct_const_round_shift(s3); - vqrshrn.s32 d24, q13, #14 ; >> 14 - vqrshrn.s32 d25, q1, #14 ; >> 14 - - ; cospi_16_64 * x6 - vmull.s16 q13, d10, d30 - vmull.s16 q1, d11, d30 - - ; cospi_6_64 * x6 - vmull.s16 q11, d10, d30 - vmull.s16 q0, d11, d30 - - ; cospi_16_64 * x6 + cospi_16_64 * x7; - vmlal.s16 q13, d14, d30 - vmlal.s16 q1, d15, d30 - - ; cospi_16_64 * x6 - cospi_16_64 * x7; - vmlsl.s16 q11, d14, d30 - vmlsl.s16 q0, d15, d30 - - ; x6 = dct_const_round_shift(s6); - vqrshrn.s32 d20, q13, #14 ; >> 14 - vqrshrn.s32 d21, q1, #14 ; >> 14 - - ;x7 = dct_const_round_shift(s7); - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q0, #14 ; >> 14 - - vdup.16 q5, r10 ; duplicate 0 - - vsub.s16 q9, q5, q9 ; output[1] = -x4; - vsub.s16 q11, q5, q2 ; output[3] = -x2; - vsub.s16 q13, q5, q6 ; output[5] = -x7; - vsub.s16 q15, q5, q4 ; output[7] = -x1; - MEND - - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride, int tx_type) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride -; r3 int tx_type) -; This function will only handle tx_type of 1,2,3. -|vp9_iht8x8_64_add_neon| PROC - - ; load the inputs into d16-d19 - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - push {r0-r10} - vpush {d8-d15} - - ; transpose the input data - TRANSPOSE8X8 - - ; decide the type of transform - cmp r3, #2 - beq idct_iadst - cmp r3, #3 - beq iadst_iadst - -iadst_idct - ; generate IDCT constants - GENERATE_IDCT_CONSTANTS - - ; first transform rows - IDCT8x8_1D - - ; transpose the matrix - TRANSPOSE8X8 - - ; generate IADST constants - GENERATE_IADST_CONSTANTS - - ; then transform columns - IADST8X8_1D - - b end_vp9_iht8x8_64_add_neon - -idct_iadst - ; generate IADST constants - GENERATE_IADST_CONSTANTS - - ; first transform rows - IADST8X8_1D - - ; transpose the matrix - TRANSPOSE8X8 - - ; generate IDCT constants - GENERATE_IDCT_CONSTANTS - - ; then transform columns - IDCT8x8_1D - - b end_vp9_iht8x8_64_add_neon - -iadst_iadst - ; generate IADST constants - GENERATE_IADST_CONSTANTS - - ; first transform rows - IADST8X8_1D - - ; transpose the matrix - TRANSPOSE8X8 - - ; then transform columns - IADST8X8_1D - -end_vp9_iht8x8_64_add_neon - vpop {d8-d15} - pop {r0-r10} - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - bx lr - ENDP ; |vp9_iht8x8_64_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c new file mode 100644 index 000000000..04b342c3d --- /dev/null +++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +static int16_t cospi_2_64 = 16305; +static int16_t cospi_4_64 = 16069; +static int16_t cospi_6_64 = 15679; +static int16_t cospi_8_64 = 15137; +static int16_t cospi_10_64 = 14449; +static int16_t cospi_12_64 = 13623; +static int16_t cospi_14_64 = 12665; +static int16_t cospi_16_64 = 11585; +static int16_t cospi_18_64 = 10394; +static int16_t cospi_20_64 = 9102; +static int16_t cospi_22_64 = 7723; +static int16_t cospi_24_64 = 6270; +static int16_t cospi_26_64 = 4756; +static int16_t cospi_28_64 = 3196; +static int16_t cospi_30_64 = 1606; + +static INLINE void TRANSPOSE8X8( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), + vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), + vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), + vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), + vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +static INLINE void IDCT8x8_1D( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; +} + +static INLINE void IADST8X8_1D( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q2s16, q4s16, q5s16, q6s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; + int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + d14s16 = vdup_n_s16(cospi_2_64); + d15s16 = vdup_n_s16(cospi_30_64); + + q1s32 = vmull_s16(d30s16, d14s16); + q2s32 = vmull_s16(d31s16, d14s16); + q3s32 = vmull_s16(d30s16, d15s16); + q4s32 = vmull_s16(d31s16, d15s16); + + d30s16 = vdup_n_s16(cospi_18_64); + d31s16 = vdup_n_s16(cospi_14_64); + + q1s32 = vmlal_s16(q1s32, d16s16, d15s16); + q2s32 = vmlal_s16(q2s32, d17s16, d15s16); + q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); + q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); + + q5s32 = vmull_s16(d22s16, d30s16); + q6s32 = vmull_s16(d23s16, d30s16); + q7s32 = vmull_s16(d22s16, d31s16); + q8s32 = vmull_s16(d23s16, d31s16); + + q5s32 = vmlal_s16(q5s32, d24s16, d31s16); + q6s32 = vmlal_s16(q6s32, d25s16, d31s16); + q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); + q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); + + q11s32 = vaddq_s32(q1s32, q5s32); + q12s32 = vaddq_s32(q2s32, q6s32); + q1s32 = vsubq_s32(q1s32, q5s32); + q2s32 = vsubq_s32(q2s32, q6s32); + + d22s16 = vqrshrn_n_s32(q11s32, 14); + d23s16 = vqrshrn_n_s32(q12s32, 14); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q12s32 = vaddq_s32(q3s32, q7s32); + q15s32 = vaddq_s32(q4s32, q8s32); + q3s32 = vsubq_s32(q3s32, q7s32); + q4s32 = vsubq_s32(q4s32, q8s32); + + d2s16 = vqrshrn_n_s32(q1s32, 14); + d3s16 = vqrshrn_n_s32(q2s32, 14); + d24s16 = vqrshrn_n_s32(q12s32, 14); + d25s16 = vqrshrn_n_s32(q15s32, 14); + d6s16 = vqrshrn_n_s32(q3s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + *q12s16 = vcombine_s16(d24s16, d25s16); + + d0s16 = vdup_n_s16(cospi_10_64); + d1s16 = vdup_n_s16(cospi_22_64); + q4s32 = vmull_s16(d26s16, d0s16); + q5s32 = vmull_s16(d27s16, d0s16); + q2s32 = vmull_s16(d26s16, d1s16); + q6s32 = vmull_s16(d27s16, d1s16); + + d30s16 = vdup_n_s16(cospi_26_64); + d31s16 = vdup_n_s16(cospi_6_64); + + q4s32 = vmlal_s16(q4s32, d20s16, d1s16); + q5s32 = vmlal_s16(q5s32, d21s16, d1s16); + q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); + q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); + + q0s32 = vmull_s16(d18s16, d30s16); + q13s32 = vmull_s16(d19s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d31s16); + q13s32 = vmlal_s16(q13s32, d29s16, d31s16); + + q10s32 = vmull_s16(d18s16, d31s16); + q9s32 = vmull_s16(d19s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); + q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); + + q14s32 = vaddq_s32(q2s32, q10s32); + q15s32 = vaddq_s32(q6s32, q9s32); + q2s32 = vsubq_s32(q2s32, q10s32); + q6s32 = vsubq_s32(q6s32, q9s32); + + d28s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d4s16 = vqrshrn_n_s32(q2s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + *q14s16 = vcombine_s16(d28s16, d29s16); + + q9s32 = vaddq_s32(q4s32, q0s32); + q10s32 = vaddq_s32(q5s32, q13s32); + q4s32 = vsubq_s32(q4s32, q0s32); + q5s32 = vsubq_s32(q5s32, q13s32); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + d18s16 = vqrshrn_n_s32(q9s32, 14); + d19s16 = vqrshrn_n_s32(q10s32, 14); + d8s16 = vqrshrn_n_s32(q4s32, 14); + d9s16 = vqrshrn_n_s32(q5s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + + q5s32 = vmull_s16(d2s16, d30s16); + q6s32 = vmull_s16(d3s16, d30s16); + q7s32 = vmull_s16(d2s16, d31s16); + q0s32 = vmull_s16(d3s16, d31s16); + + q5s32 = vmlal_s16(q5s32, d6s16, d31s16); + q6s32 = vmlal_s16(q6s32, d7s16, d31s16); + q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); + q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); + + q1s32 = vmull_s16(d4s16, d30s16); + q3s32 = vmull_s16(d5s16, d30s16); + q10s32 = vmull_s16(d4s16, d31s16); + q2s32 = vmull_s16(d5s16, d31s16); + + q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); + q10s32 = vmlal_s16(q10s32, d8s16, d30s16); + q2s32 = vmlal_s16(q2s32, d9s16, d30s16); + + *q8s16 = vaddq_s16(*q11s16, *q9s16); + *q11s16 = vsubq_s16(*q11s16, *q9s16); + q4s16 = vaddq_s16(*q12s16, *q14s16); + *q12s16 = vsubq_s16(*q12s16, *q14s16); + + q14s32 = vaddq_s32(q5s32, q1s32); + q15s32 = vaddq_s32(q6s32, q3s32); + q5s32 = vsubq_s32(q5s32, q1s32); + q6s32 = vsubq_s32(q6s32, q3s32); + + d18s16 = vqrshrn_n_s32(q14s32, 14); + d19s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + + q1s32 = vaddq_s32(q7s32, q10s32); + q3s32 = vaddq_s32(q0s32, q2s32); + q7s32 = vsubq_s32(q7s32, q10s32); + q0s32 = vsubq_s32(q0s32, q2s32); + + d28s16 = vqrshrn_n_s32(q1s32, 14); + d29s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q7s32, 14); + d15s16 = vqrshrn_n_s32(q0s32, 14); + *q14s16 = vcombine_s16(d28s16, d29s16); + + d30s16 = vdup_n_s16(cospi_16_64); + + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + q2s32 = vmull_s16(d22s16, d30s16); + q3s32 = vmull_s16(d23s16, d30s16); + q13s32 = vmull_s16(d22s16, d30s16); + q1s32 = vmull_s16(d23s16, d30s16); + + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + q2s32 = vmlal_s16(q2s32, d24s16, d30s16); + q3s32 = vmlal_s16(q3s32, d25s16, d30s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); + q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); + + d4s16 = vqrshrn_n_s32(q2s32, 14); + d5s16 = vqrshrn_n_s32(q3s32, 14); + d24s16 = vqrshrn_n_s32(q13s32, 14); + d25s16 = vqrshrn_n_s32(q1s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + *q12s16 = vcombine_s16(d24s16, d25s16); + + q13s32 = vmull_s16(d10s16, d30s16); + q1s32 = vmull_s16(d11s16, d30s16); + q11s32 = vmull_s16(d10s16, d30s16); + q0s32 = vmull_s16(d11s16, d30s16); + + q13s32 = vmlal_s16(q13s32, d14s16, d30s16); + q1s32 = vmlal_s16(q1s32, d15s16, d30s16); + q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); + q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); + + d20s16 = vqrshrn_n_s32(q13s32, 14); + d21s16 = vqrshrn_n_s32(q1s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q0s32, 14); + *q10s16 = vcombine_s16(d20s16, d21s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q5s16 = vdupq_n_s16(0); + + *q9s16 = vsubq_s16(q5s16, *q9s16); + *q11s16 = vsubq_s16(q5s16, q2s16); + *q13s16 = vsubq_s16(q5s16, q6s16); + *q15s16 = vsubq_s16(q5s16, q4s16); + return; +} + +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i; + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 8 * 2); + q11s16 = vld1q_s16(input + 8 * 3); + q12s16 = vld1q_s16(input + 8 * 4); + q13s16 = vld1q_s16(input + 8 * 5); + q14s16 = vld1q_s16(input + 8 * 6); + q15s16 = vld1q_s16(input + 8 * 7); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + switch (tx_type) { + case 0: // idct_idct is not supported. Fall back to C + vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type); + return; + break; + case 1: // iadst_idct + // generate IDCT constants + // GENERATE_IDCT_CONSTANTS + + // first transform rows + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // transpose the matrix + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // generate IADST constants + // GENERATE_IADST_CONSTANTS + + // then transform columns + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + break; + case 2: // idct_iadst + // generate IADST constants + // GENERATE_IADST_CONSTANTS + + // first transform rows + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // transpose the matrix + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // generate IDCT constants + // GENERATE_IDCT_CONSTANTS + + // then transform columns + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + break; + case 3: // iadst_iadst + // generate IADST constants + // GENERATE_IADST_CONSTANTS + + // first transform rows + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // transpose the matrix + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // then transform columns + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + break; + default: // iadst_idct + assert(0); + break; + } + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + for (d1 = d2 = dest, i = 0; i < 2; i++) { + if (i != 0) { + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + } + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + } + return; +} diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index 97fe02805..09f470e97 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,9 +8,178 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <arm_neon.h> + #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE void vp9_loop_filter_neon_16( + uint8x16_t qblimit, // blimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vdupq_n_u16(3); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q4 = vdupq_n_u8(3); + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); + *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); + return; +} + +#if !HAVE_NEON_ASM +void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; + uint8x16_t qblimit, qlimit, qthresh; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + + dblimit0 = vld1_u8(blimit0); + dlimit0 = vld1_u8(limit0); + dthresh0 = vld1_u8(thresh0); + dblimit1 = vld1_u8(blimit1); + dlimit1 = vld1_u8(limit1); + dthresh1 = vld1_u8(thresh1); + qblimit = vcombine_u8(dblimit0, dblimit1); + qlimit = vcombine_u8(dlimit0, dlimit1); + qthresh = vcombine_u8(dthresh0, dthresh1); + + s -= (p << 2); + + q3u8 = vld1q_u8(s); + s += p; + q4u8 = vld1q_u8(s); + s += p; + q5u8 = vld1q_u8(s); + s += p; + q6u8 = vld1q_u8(s); + s += p; + q7u8 = vld1q_u8(s); + s += p; + q8u8 = vld1q_u8(s); + s += p; + q9u8 = vld1q_u8(s); + s += p; + q10u8 = vld1q_u8(s); + + vp9_loop_filter_neon_16(qblimit, qlimit, qthresh, + q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, + &q5u8, &q6u8, &q7u8, &q8u8); + + s -= (p * 5); + vst1q_u8(s, q5u8); + s += p; + vst1q_u8(s, q6u8); + s += p; + vst1q_u8(s, q7u8); + s += p; + vst1q_u8(s, q8u8); + return; +} +#endif // !HAVE_NEON_ASM + void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c index f54d7a94b..079d26677 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c @@ -10,7 +10,9 @@ #include <arm_neon.h> -static inline void vp9_loop_filter_neon( +#include "./vpx_config.h" + +static INLINE void vp9_loop_filter_neon( uint8x8_t dblimit, // flimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh @@ -271,7 +273,7 @@ void vp9_lpf_vertical_4_neon( return; } -static inline void vp9_mbloop_filter_neon( +static INLINE void vp9_mbloop_filter_neon( uint8x8_t dblimit, // mblimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c new file mode 100644 index 000000000..d0beaa720 --- /dev/null +++ b/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stddef.h> +#include <arm_neon.h> + +void vp9_v_predictor_4x4_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += y_stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); + return; +} + +void vp9_v_predictor_8x8_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += y_stride) + vst1_u8(dst, d0u8); + return; +} + +void vp9_v_predictor_16x16_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += y_stride) + vst1q_u8(dst, q0u8); + return; +} + +void vp9_v_predictor_32x32_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += y_stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } + return; +} + +void vp9_h_predictor_4x4_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + return; +} + +void vp9_h_predictor_8x8_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); + return; +} + +void vp9_h_predictor_16x16_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += y_stride; + } + return; +} + +void vp9_h_predictor_32x32_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + } + } + return; +} + +void vp9_tm_predictor_4x4_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); + + d0u8 = vdup_n_u8(above[-1]); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += y_stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), + vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } + return; +} + +void vp9_tm_predictor_8x8_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; + + d0u8 = vdup_n_u8(above[-1]); + d30u8 = vld1_u8(left); + d2u8 = vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + } + return; +} + +void vp9_tm_predictor_16x16_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + + q0u8 = vdupq_n_u8(above[-1]); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += y_stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += y_stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += y_stride; + } + } + return; +} + +void vp9_tm_predictor_32x32_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + + q0u8 = vdupq_n_u8(above[-1]); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d6u16, 2); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d6u16, 3); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm index dc9856fa8..dc9856fa8 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index cb299f9f7..2f75af575 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -44,8 +44,10 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) { vp9_free_frame_buffer(&cm->frame_bufs[i].buf); } +#if CONFIG_VP9_POSTPROC vp9_free_frame_buffer(&cm->post_proc_buffer); vp9_free_frame_buffer(&cm->post_proc_buffer_int); +#endif } void vp9_free_context_buffers(VP9_COMMON *cm) { @@ -110,7 +112,8 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) goto fail; if (cm->frame_bufs[i].mvs == NULL) { cm->frame_bufs[i].mvs = @@ -126,12 +129,13 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { init_frame_bufs(cm); -#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC +#if CONFIG_VP9_POSTPROC if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) goto fail; #endif diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 7d7209c56..e7fb19fd1 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -212,6 +212,12 @@ typedef struct macroblockd { /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; + /* mc buffer */ DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); @@ -221,17 +227,11 @@ typedef struct macroblockd { DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); #endif - int lossless; + /* dqcoeff are shared by all the planes. So planes must be decoded serially */ + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]); + int lossless; int corrupted; - - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]); - - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; - - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[8]; } MACROBLOCKD; static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index d7610ed28..4557e19bf 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -453,6 +453,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_default_coef_probs(cm); vp9_init_mode_probs(cm->fc); vp9_init_mv_probs(cm); + cm->fc->initialized = 1; if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->reset_frame_context == 3) { @@ -469,8 +470,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); - vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); - vp9_zero(cm->ref_frame_sign_bias); cm->frame_context_idx = 0; diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 6831d3f87..6db10806d 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -50,6 +50,7 @@ typedef struct frame_contexts { struct tx_probs tx_probs; vp9_prob skip_probs[SKIP_CONTEXTS]; nmv_context nmvc; + int initialized; } FRAME_CONTEXT; typedef struct { diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 43a4fe5b9..58b2da75f 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -968,7 +968,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, break; } // The largest loopfilter we have is 16x16 so we use the 16x16 mask - // for 32x32 transforms also also. + // for 32x32 transforms also. lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32]; lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32]; lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32]; diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index f1bdc1b06..92650e954 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -210,17 +210,85 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, return; } // No MFQE on blocks smaller than 16x16 - if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) { + if (bs == BLOCK_16X16) { partition = PARTITION_NONE; } + if (bs == BLOCK_64X64) { + mi_offset = 4; + y_offset = 32; + uv_offset = 16; + } else { + mi_offset = 2; + y_offset = 16; + uv_offset = 8; + } switch (partition) { + BLOCK_SIZE mfqe_bs, bs_tmp; case PARTITION_HORZ: + if (bs == BLOCK_64X64) { + mfqe_bs = BLOCK_64X32; + bs_tmp = BLOCK_32X32; + } else { + mfqe_bs = BLOCK_32X16; + bs_tmp = BLOCK_16X16; + } + if (mfqe_decision(mi, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, + yd, ud, vd, yd_stride, uvd_stride); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, + y_stride, uv_stride, yd + y_offset, ud + uv_offset, + vd + uv_offset, yd_stride, uvd_stride); + } + if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride, + v + uv_offset * uv_stride, y_stride, uv_stride, + yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, + vd + uv_offset * uvd_stride, yd_stride, uvd_stride); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset, + u + uv_offset * uv_stride + uv_offset, + v + uv_offset * uv_stride + uv_offset, y_stride, + uv_stride, yd + y_offset * yd_stride + y_offset, + ud + uv_offset * uvd_stride + uv_offset, + vd + uv_offset * uvd_stride + uv_offset, + yd_stride, uvd_stride); + } + break; case PARTITION_VERT: - // If current block size is not square. - // Copy the block from current frame(i.e., no mfqe is done). - // TODO(jackychen): Rectangle blocks should also be taken into account. - copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, - yd_stride, uvd_stride, bs); + if (bs == BLOCK_64X64) { + mfqe_bs = BLOCK_32X64; + bs_tmp = BLOCK_32X32; + } else { + mfqe_bs = BLOCK_16X32; + bs_tmp = BLOCK_16X16; + } + if (mfqe_decision(mi, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, + yd, ud, vd, yd_stride, uvd_stride); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride, + v + uv_offset * uv_stride, y_stride, uv_stride, + yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, + vd + uv_offset * uvd_stride, yd_stride, uvd_stride); + } + if (mfqe_decision(mi + mi_offset, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, + y_stride, uv_stride, yd + y_offset, ud + uv_offset, + vd + uv_offset, yd_stride, uvd_stride); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset, + u + uv_offset * uv_stride + uv_offset, + v + uv_offset * uv_stride + uv_offset, y_stride, + uv_stride, yd + y_offset * yd_stride + y_offset, + ud + uv_offset * uvd_stride + uv_offset, + vd + uv_offset * uvd_stride + uv_offset, + yd_stride, uvd_stride); + } break; case PARTITION_NONE: if (mfqe_decision(mi, cur_bs)) { @@ -234,15 +302,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, } break; case PARTITION_SPLIT: - if (bs == BLOCK_64X64) { - mi_offset = 4; - y_offset = 32; - uv_offset = 16; - } else { - mi_offset = 2; - y_offset = 16; - uv_offset = 8; - } // Recursion on four square partitions, e.g. if bs is 64X64, // then look into four 32X32 blocks in it. mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd, diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 55a1f86c7..ad91c10dd 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -112,8 +112,10 @@ typedef struct VP9Common { int new_fb_idx; +#if CONFIG_VP9_POSTPROC YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG post_proc_buffer_int; +#endif FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ FRAME_TYPE frame_type; @@ -182,7 +184,6 @@ typedef struct VP9Common { struct segmentation seg; // Context probabilities for reference frame prediction - int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; REFERENCE_MODE reference_mode; @@ -207,6 +208,7 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; + int byte_alignment; // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -263,7 +265,7 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { int i; for (i = 0; i < MAX_MB_PLANE; ++i) { - xd->plane[i].dqcoeff = xd->dqcoeff[i]; + xd->plane[i].dqcoeff = xd->dqcoeff; xd->above_context[i] = cm->above_context + i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols); } diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index e1a389132..7eac70be2 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -671,7 +671,8 @@ int vp9_post_proc_frame(struct VP9Common *cm, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif // CONFIG_VP9_HIGHBITDEPTH - VP9_ENC_BORDER_IN_PIXELS) < 0) { + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate MFQE framebuffer"); } @@ -683,19 +684,18 @@ int vp9_post_proc_frame(struct VP9Common *cm, } } -#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) + VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL) < 0) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate post-processing buffer"); -#endif if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 && - cm->postproc_state.last_frame_valid && + cm->postproc_state.last_frame_valid && cm->bit_depth == 8 && cm->postproc_state.last_base_qindex <= last_q_thresh && cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) { vp9_mfqe(cm); @@ -749,4 +749,4 @@ int vp9_post_proc_frame(struct VP9Common *cm, swap_mi_and_prev_mi(cm); return 0; } -#endif +#endif // CONFIG_VP9_POSTPROC diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c index dc15a84ff..c777bc81f 100644 --- a/vp9/common/vp9_rtcd.c +++ b/vp9/common/vp9_rtcd.c @@ -16,5 +16,7 @@ void vpx_scale_rtcd(void); void vp9_rtcd() { vpx_scale_rtcd(); + // TODO(JBB): Remove this once, by insuring that both the encoder and + // decoder setup functions are protected by once(); once(setup_rtcd_internal); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 575990bb5..d2ab875e9 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -66,8 +66,7 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc"; -$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon; +specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_4x4/; @@ -79,12 +78,10 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc"; -$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon; +specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc"; -$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon; +specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc"; add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; @@ -108,8 +105,7 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc"; -$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon; +specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_8x8/; @@ -121,12 +117,10 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc"; -$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon; +specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc"; -$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon; +specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; @@ -150,8 +144,7 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc"; -$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon; +specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_16x16/; @@ -163,12 +156,10 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc"; -$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon; +specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc"; -$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon; +specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; @@ -192,8 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc"; -$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon; +specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_32x32/; @@ -205,12 +195,10 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_32x32/; add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc"; -$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon; +specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64"; -$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon; +specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64"; add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; @@ -261,8 +249,7 @@ add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t * specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/; -$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon; +specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/; # # post proc @@ -457,12 +444,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/; add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; - $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; + specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/; add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; - $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; + specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/vp9_iht16x16_256_add sse2 dspr2/; @@ -1140,37 +1125,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error/; - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp/; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32/; - add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b/; - add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b_32x32/; - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant/; } else { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error avx2/, "$sse2_x86inc"; - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant sse2 ssse3/; } @@ -1865,16 +1850,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/vp9_highbd_subtract_block/; - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_highbd_quantize_fp/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_highbd_quantize_fp_32x32/; - add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_highbd_quantize_b sse2/; - add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_highbd_quantize_b_32x32 sse2/; # diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index c4efa6565..71dbb402d 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -312,9 +312,11 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, unsigned int out_pitch, unsigned int output_height, int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -333,27 +335,26 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + // load the first 7 rows of 8 bytes + srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); + srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); + srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); + srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); + srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); + srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); + for (i = 0; i < output_height; i++) { - // load the first 8 bytes - srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); - srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + // load the last 8 bytes + srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); - srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); - srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); + srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); @@ -377,6 +378,15 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, src_ptr+=src_pitch; + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + // save only 8 bytes convolve result _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); @@ -390,9 +400,11 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, unsigned int out_pitch, unsigned int output_height, int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -411,19 +423,24 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + // load the first 7 rows of 16 bytes + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); + for (i = 0; i < output_height; i++) { - // load the first 16 bytes - srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); - // load the next 16 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); - srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + // load the last 16 bytes + srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); // merge the result together - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); + srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); @@ -435,25 +452,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - // load the next 16 bytes in stride of two/three src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); - // merge the result together - srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); - // load the next 16 bytes in stride of four/five src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); - // merge the result together - srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); @@ -461,13 +470,13 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_min_epi16(srcRegFilt4, srcRegFilt7)); + _mm_min_epi16(srcRegFilt3, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt6, srcRegFilt8)); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_max_epi16(srcRegFilt4, srcRegFilt7)); + _mm_max_epi16(srcRegFilt3, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt6, srcRegFilt8)); srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); @@ -484,6 +493,15 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, src_ptr+=src_pitch; + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + // save 16 bytes convolve result _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 58df87d0c..9677173db 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -719,6 +719,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->use_highbitdepth, #endif VP9_DEC_BORDER_IN_PIXELS, + cm->byte_alignment, &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, cm->cb_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -793,6 +794,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, cm->use_highbitdepth, #endif VP9_DEC_BORDER_IN_PIXELS, + cm->byte_alignment, &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, cm->cb_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -1556,6 +1558,10 @@ void vp9_decode_frame(VP9Decoder *pbi, vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); *cm->fc = cm->frame_contexts[cm->frame_context_idx]; + if (!cm->fc->initialized) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + vp9_zero(cm->counts); xd->corrupted = 0; diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 2daf86200..1406b4034 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -15,6 +15,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" @@ -33,8 +34,8 @@ #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_dthread.h" -static void initialize_dec() { - static int init_done = 0; +static void initialize_dec(void) { + static volatile int init_done = 0; if (!init_done) { vp9_rtcd(); @@ -85,7 +86,7 @@ VP9Decoder *vp9_decoder_create() { sizeof(*cm->frame_contexts))); pbi->need_resync = 1; - initialize_dec(); + once(initialize_dec); // Initialize the references to not point to any frame buffers. vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c index 3eef72844..c3b38a9c7 100644 --- a/vp9/decoder/vp9_read_bit_buffer.c +++ b/vp9/decoder/vp9_read_bit_buffer.c @@ -10,20 +10,20 @@ #include "vp9/decoder/vp9_read_bit_buffer.h" size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) { - return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT; + return (rb->bit_offset + 7) >> 3; } int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) { const size_t off = rb->bit_offset; - const size_t p = off / CHAR_BIT; - const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT; - if (rb->bit_buffer + p >= rb->bit_buffer_end) { - rb->error_handler(rb->error_handler_data); - return 0; - } else { - const int bit = (rb->bit_buffer[p] & (1 << q)) >> q; + const size_t p = off >> 3; + const int q = 7 - (int)(off & 0x7); + if (rb->bit_buffer + p < rb->bit_buffer_end) { + const int bit = (rb->bit_buffer[p] >> q) & 1; rb->bit_offset = off + 1; return bit; + } else { + rb->error_handler(rb->error_handler_data); + return 0; } } diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 8c13d0da6..9cf1e5e2c 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -26,13 +26,12 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)scan; if (!skip_block) { diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 019d5b90b..752429c8f 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1181,7 +1181,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], counts->intra_inter[i]); - if (cm->allow_comp_inter_inter) { + if (cpi->allow_comp_inter_inter) { const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 2ffc7ea67..68174a6cc 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -40,8 +40,6 @@ struct macroblock_plane { int16_t *round; int64_t quant_thred[2]; - // Zbin Over Quant value - int16_t zbin_extra; }; /* The [2] dimension is for whether we skip the EOB node (i.e. if previous diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 020a95196..506f6de84 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -339,7 +339,7 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int eob = -1; @@ -416,7 +416,6 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 4deeed217..56ec6b335 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -425,6 +425,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #endif int border) { int i, fail; + const int legacy_byte_alignment = 0; assert(denoiser != NULL); for (i = 0; i < MAX_REF_FRAMES; ++i) { @@ -433,7 +434,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif - border); + border, legacy_byte_alignment); if (fail) { vp9_denoiser_free(denoiser); return 1; @@ -448,7 +449,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif - border); + border, legacy_byte_alignment); if (fail) { vp9_denoiser_free(denoiser); return 1; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 756393f31..4c948237d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -423,8 +423,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME) { bsize_ref = BLOCK_8X8; - // Choose lower thresholds for key frame variance to favor split. - threshold_bsize_ref = threshold >> 1; + // Choose lower thresholds for key frame variance to favor split, but keep + // threshold for splitting to 4x4 block still fairly high for now. + threshold_bsize_ref = threshold << 2; threshold_low = threshold >> 2; } @@ -592,7 +593,16 @@ static void choose_partitioning(VP9_COMP *cpi, unsigned int sse = 0; int sum = 0; if (x4_idx < pixels_wide && y4_idx < pixels_high) { +#if CONFIG_VP9_HIGHBITDEPTH + int s_avg; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + } else { + s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + } +#else int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); +#endif // For key frame, reference is set to 128. sum = s_avg - 128; sse = sum * sum; @@ -646,8 +656,6 @@ static void choose_partitioning(VP9_COMP *cpi, for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - // TODO(marpan): Allow for setting 4x4 partition on key frame. - /* if (cm->frame_type == KEY_FRAME) { if (!set_vt_partitioning(cpi, xd, &vt.split[i].split[j].split[k], @@ -660,12 +668,11 @@ static void choose_partitioning(VP9_COMP *cpi, BLOCK_4X4); } } else { - */ set_block_size(cpi, xd, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8); - // } + } } } } @@ -3612,7 +3619,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { if (xd->lossless) { x->optimize = 0; cm->lf.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; } vp9_frame_init_quantizer(cpi); @@ -3716,9 +3722,9 @@ void vp9_encode_frame(VP9_COMP *cpi) { cm->ref_frame_sign_bias[GOLDEN_FRAME]) || (cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[LAST_FRAME])) { - cm->allow_comp_inter_inter = 0; + cpi->allow_comp_inter_inter = 0; } else { - cm->allow_comp_inter_inter = 1; + cpi->allow_comp_inter_inter = 1; cm->comp_fixed_ref = ALTREF_FRAME; cm->comp_var_ref[0] = LAST_FRAME; cm->comp_var_ref[1] = GOLDEN_FRAME; @@ -3742,7 +3748,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { const int is_alt_ref = frame_type == ALTREF_FRAME; /* prediction (compound, single or hybrid) mode selection */ - if (is_alt_ref || !cm->allow_comp_inter_inter) + if (is_alt_ref || !cpi->allow_comp_inter_inter) cm->reference_mode = SINGLE_REFERENCE; else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && mode_thrs[COMPOUND_REFERENCE] > @@ -3852,24 +3858,6 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) { ++counts->uv_mode[y_mode][uv_mode]; } -static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) { - if (enabled) { - if (is_inter_block(mbmi)) { - if (mbmi->mode == ZEROMV) { - return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST - : LF_ZEROMV_ZBIN_BOOST; - } else { - return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST - : MV_ZBIN_BOOST; - } - } else { - return INTRA_ZBIN_BOOST; - } - } else { - return 0; - } -} - static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int output_enabled, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -3905,12 +3893,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - // Experimental code. Special case for gf and arf zeromv modes. - // Increase zbin size to suppress noise - cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi, - cpi->zbin_mode_boost_enabled); - vp9_update_zbin_extra(cpi, x); - if (!is_inter_block(mbmi)) { int plane; mbmi->skip = 1; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 9b2165be6..9c29eb438 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -149,7 +149,6 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, int64_t rd_cost0, rd_cost1; int rate0, rate1, error0, error1, t0, t1; int best, band, pt, i, final_eob; - const TOKENVALUE *dct_value_tokens; const int16_t *dct_value_cost; assert((!type && !plane) || (type && plane)); @@ -169,22 +168,18 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->bd == 12) { - dct_value_tokens = vp9_dct_value_tokens_high12_ptr; dct_value_cost = vp9_dct_value_cost_high12_ptr; } else if (xd->bd == 10) { - dct_value_tokens = vp9_dct_value_tokens_high10_ptr; dct_value_cost = vp9_dct_value_cost_high10_ptr; } else { - dct_value_tokens = vp9_dct_value_tokens_ptr; dct_value_cost = vp9_dct_value_cost_ptr; } #else - dct_value_tokens = vp9_dct_value_tokens_ptr; dct_value_cost = vp9_dct_value_cost_ptr; #endif for (i = 0; i < eob; i++) token_cache[scan[i]] = - vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token]; + vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])]; for (i = eob; i-- > 0;) { int base_bits, d2, dx; @@ -198,7 +193,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, /* Evaluate the first possibility for this state. */ rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - t0 = (dct_value_tokens + x)->token; + t0 = vp9_get_token(x); /* Consider both possible successor states. */ if (next < default_eob) { band = band_translate[i + 1]; @@ -250,7 +245,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; } else { - t0 = t1 = (dct_value_tokens + x)->token; + t0 = t1 = vp9_get_token(x); } if (next < default_eob) { band = band_translate[i + 1]; @@ -391,28 +386,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - p->zbin_extra, eob, scan_order->scan, + eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vp9_highbd_fdct16x16(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vp9_highbd_fdct8x8(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: @@ -427,28 +422,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vp9_fdct16x16(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: @@ -561,28 +556,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, p->zbin_extra, eob, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vp9_highbd_fdct16x16(src_diff, coeff, diff_stride); vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vp9_highbd_fdct8x8(src_diff, coeff, diff_stride); vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: @@ -597,28 +592,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vp9_fdct16x16(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vp9_fdct8x8(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: @@ -849,8 +844,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, - p->zbin_extra, eob, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) { @@ -871,7 +865,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) { @@ -893,7 +887,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) { @@ -919,7 +913,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } @@ -958,7 +952,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) @@ -978,7 +972,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) @@ -998,7 +992,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) @@ -1022,7 +1016,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, + pd->dequant, eob, scan_order->scan, scan_order->iscan); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 620027e2d..047b9aaef 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -181,8 +181,8 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; } -void vp9_initialize_enc() { - static int init_done = 0; +void vp9_initialize_enc(void) { + static volatile int init_done = 0; if (!init_done) { vp9_rtcd(); @@ -490,7 +490,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -510,7 +511,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); @@ -520,7 +522,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); @@ -530,7 +533,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); } @@ -566,7 +570,8 @@ static void update_frame_size(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate alt_ref_buffer"); } @@ -2472,7 +2477,8 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, cm->use_highbitdepth, - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf, (int)cm->bit_depth); #else @@ -2481,7 +2487,8 @@ void vp9_scale_references(VP9_COMP *cpi) { vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); #endif // CONFIG_VP9_HIGHBITDEPTH cpi->scaled_ref_idx[ref_frame - 1] = new_fb; @@ -2720,7 +2727,8 @@ void set_frame_size(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL); alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); @@ -3139,12 +3147,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_clear_system_state(); - // Enable or disable mode based tweaking of the zbin. - // For 2 pass only used where GF/ARF prediction quality - // is above a threshold. - cpi->zbin_mode_boost = 0; - cpi->zbin_mode_boost_enabled = 0; - // Set the arf sign bias for this frame. set_arf_sign_bias(cpi); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 14f7c7f0c..7872e2cc1 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -315,9 +315,6 @@ typedef struct VP9_COMP { int *nmvsadcosts[2]; int *nmvsadcosts_hp[2]; - int zbin_mode_boost; - int zbin_mode_boost_enabled; - int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; @@ -339,6 +336,8 @@ typedef struct VP9_COMP { unsigned int max_mv_magnitude; int mv_step_param; + int allow_comp_inter_inter; + // Default value is 1. From first pass stats, encode_breakout may be disabled. ENCODE_BREAKOUT_TYPE allow_encode_breakout; @@ -449,7 +448,7 @@ typedef struct VP9_COMP { VP9Worker *workers; } VP9_COMP; -void vp9_initialize_enc(); +void vp9_initialize_enc(void); struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf); void vp9_remove_compressor(VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index 823e7a162..708072ee2 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -65,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, // Allocate the lookahead structures ctx = calloc(1, sizeof(*ctx)); if (ctx) { + const int legacy_byte_alignment = 0; unsigned int i; ctx->max_sz = depth; ctx->buf = calloc(depth, sizeof(*ctx->buf)); @@ -76,7 +77,8 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, + legacy_byte_alignment)) goto bail; } return ctx; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index b45032456..319a47833 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -249,14 +249,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize], + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], dc_quant >> (xd->bd - 5), &rate, &dist); } else { - vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize], + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], dc_quant >> 3, &rate, &dist); } #else - vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize], + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], dc_quant >> 3, &rate, &dist); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -265,14 +265,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize], + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> (xd->bd - 5), &rate, &dist); } else { - vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize], + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> 3, &rate, &dist); } #else - vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize], + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> 3, &rate, &dist); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -447,11 +447,10 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, args->dist += dist; } -static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = { +static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = { {THR_DC, THR_H_PRED, THR_V_PRED, THR_TM}, {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV}, {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG}, - {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA}, }; static const PREDICTION_MODE intra_mode_list[] = { @@ -522,8 +521,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, struct macroblockd_plane *const pd = &xd->plane[0]; PREDICTION_MODE best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cm->tx_mode]); + TX_SIZE best_tx_size = TX_SIZES; INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; @@ -537,9 +535,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // Reduce the intra cost penalty for small blocks (<=16x16). const int reduction_fac = (cpi->sf.partition_search_type == VAR_BASED_PARTITION && - bsize <= BLOCK_16X16) ? 4 : 1; + bsize <= BLOCK_16X16) ? 2 : 0; const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac; + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac; const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int8_t segment_id = mbmi->segment_id; @@ -839,13 +837,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // Perform intra prediction search, if the best SAD is above a certain // threshold. - if (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize) { + if (best_rdc.rdcost == INT64_MAX || + (!x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 }; const TX_SIZE intra_tx_size = MIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); int i; + TX_SIZE best_intra_tx_size = TX_SIZES; if (reuse_inter_pred && best_pred != NULL) { if (best_pred->data == orig_dst.buf) { @@ -870,11 +870,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, pd->dst = orig_dst; for (i = 0; i < 4; ++i) { - const TX_SIZE saved_tx_size = mbmi->tx_size; const PREDICTION_MODE this_mode = intra_mode_list[i]; if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size])) continue; - skip_txfm = x->skip_txfm[0]; args.mode = this_mode; args.rate = 0; args.dist = 0; @@ -891,15 +889,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; mbmi->mode = this_mode; - mbmi->tx_size = intra_tx_size; + best_intra_tx_size = mbmi->tx_size; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->uv_mode = this_mode; mbmi->mv[0].as_int = INVALID_MV; - } else { - x->skip_txfm[0] = best_mode_skip_txfm; - mbmi->tx_size = saved_tx_size; } } + + // Reset mb_mode_info to the best inter mode. + if (mbmi->ref_frame[0] != INTRA_FRAME) { + x->skip_txfm[0] = best_mode_skip_txfm; + mbmi->tx_size = best_tx_size; + } else { + mbmi->tx_size = best_intra_tx_size; + } } pd->dst = orig_dst; @@ -923,14 +926,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - if (is_inter_block(mbmi)) - vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, - mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)]); - else - vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, - mode_idx[INTRA_FRAME][mbmi->mode]); + if (cpi->sf.adaptive_rd_thresh) { + THR_MODES best_mode_idx = is_inter_block(mbmi) ? + mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)] : + mode_idx[INTRA_FRAME][mbmi->mode]; + PREDICTION_MODE this_mode; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; + int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else + *freq_fact = MIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } + } + } *rd_cost = best_rdc; } diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index e7a20c4d2..389dc87e0 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -122,14 +122,13 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -168,7 +167,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -178,7 +176,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)iscan; vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); @@ -217,12 +214,11 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -261,12 +257,11 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -302,13 +297,11 @@ void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, - zbin_ptr[1] + zbin_oq_value }; - const int nzbins[2] = { zbins[0] * -1, - zbins[1] * -1 }; + const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; + const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -355,14 +348,12 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, int zbin_oq_value, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, - zbin_ptr[1] + zbin_oq_value }; - const int nzbins[2] = { zbins[0] * -1, - zbins[1] * -1 }; + const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; + const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -412,10 +403,10 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1), - ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) }; + const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; int idx = 0; @@ -471,11 +462,11 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1), - ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; + const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; int idx = 0; int idx_arr[1024]; @@ -534,7 +525,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, p->zbin, p->round, p->quant, p->quant_shift, BLOCK_OFFSET(p->qcoeff, block), BLOCK_OFFSET(pd->dqcoeff, block), - pd->dequant, p->zbin_extra, &p->eobs[block], + pd->dequant, &p->eobs[block], scan, iscan); return; } @@ -544,7 +535,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, p->zbin, p->round, p->quant, p->quant_shift, BLOCK_OFFSET(p->qcoeff, block), BLOCK_OFFSET(pd->dqcoeff, block), - pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan); + pd->dequant, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { @@ -641,7 +632,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { const int segment_id = xd->mi[0].src_mi->mbmi.segment_id; const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); - const int zbin = cpi->zbin_mode_boost; int i; // Y @@ -651,13 +641,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; - x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); xd->plane[0].dequant = cm->y_dequant[qindex]; - x->plane[0].quant_thred[0] = (x->plane[0].zbin[0] + x->plane[0].zbin_extra) * - (x->plane[0].zbin[0] + x->plane[0].zbin_extra); - x->plane[0].quant_thred[1] = (x->plane[0].zbin[1] + x->plane[0].zbin_extra) * - (x->plane[0].zbin[1] + x->plane[0].zbin_extra); + x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0]; + x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1]; // UV for (i = 1; i < 3; i++) { @@ -667,15 +654,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; - x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); xd->plane[i].dequant = cm->uv_dequant[qindex]; - x->plane[i].quant_thred[0] = - (x->plane[i].zbin[0] + x->plane[i].zbin_extra) * - (x->plane[i].zbin[0] + x->plane[i].zbin_extra); - x->plane[i].quant_thred[1] = - (x->plane[i].zbin[1] + x->plane[i].zbin_extra) * - (x->plane[i].zbin[1] + x->plane[i].zbin_extra); + x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0]; + x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; } x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); @@ -687,20 +669,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { vp9_initialize_me_consts(cpi, x->q_index); } -void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { - const int qindex = x->q_index; - const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] * - cpi->zbin_mode_boost) >> 7; - const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] * - cpi->zbin_mode_boost) >> 7; - - x->plane[0].zbin_extra = (int16_t)y_zbin_extra; - x->plane[1].zbin_extra = (int16_t)uv_zbin_extra; - x->plane[2].zbin_extra = (int16_t)uv_zbin_extra; -} - void vp9_frame_init_quantizer(VP9_COMP *cpi) { - cpi->zbin_mode_boost = 0; vp9_init_plane_quantizers(cpi, &cpi->td.mb); } diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index cee46e7e0..de2839f5b 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -68,8 +68,6 @@ struct VP9Common; void vp9_frame_init_quantizer(struct VP9_COMP *cpi); -void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); - void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x); void vp9_init_quantizer(struct VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 5b49bfc17..34d49f058 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -379,7 +379,7 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist) { // This function models the rate and distortion for a Laplacian @@ -395,10 +395,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, int d_q10, r_q10; static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = - ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10); model_rd_norm(xsq_q10, &r_q10, &d_q10); - *rate = (n * r_q10 + 2) >> 2; + *rate = ((r_q10 << n_log2) + 2) >> 2; *dist = (var * (int64_t)d_q10 + 512) >> 10; } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 600a3eb1a..ded082f86 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -37,6 +37,7 @@ #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_variance.h" +#include "vp9/encoder/vp9_aq_variance.h" #define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \ (1 << INTRA_FRAME)) @@ -48,6 +49,7 @@ #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) #define MIN_EARLY_TERM_INDEX 3 +#define NEW_MV_DISCOUNT_FACTOR 8 typedef struct { PREDICTION_MODE mode; @@ -75,6 +77,7 @@ struct rdcost_block_args { const scan_order *so; }; +#define LAST_NEW_MV_INDEX 6 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { {NEARESTMV, {LAST_FRAME, NONE}}, {NEARESTMV, {ALTREF_FRAME, NONE}}, @@ -265,15 +268,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, } else { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs], + vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs], pd->dequant[1] >> (xd->bd - 5), &rate, &dist); } else { - vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs], + vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); } #else - vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs], + vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); #endif // CONFIG_VP9_HIGHBITDEPTH rate_sum += rate; @@ -370,7 +373,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, // dc token int v = qcoeff[0]; - int prev_t = vp9_dct_value_tokens_ptr[v].token; + int prev_t = vp9_get_token(v); cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; token_cache[0] = vp9_pt_energy_class[prev_t]; ++token_costs; @@ -381,7 +384,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, int t; v = qcoeff[rc]; - t = vp9_dct_value_tokens_ptr[v].token; + t = vp9_get_token(v); if (use_fast_coef_costing) { cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v]; } else { @@ -2355,6 +2358,27 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, } } +// In some situations we want to discount tha pparent cost of a new motion +// vector. Where there is a subtle motion field and especially where there is +// low spatial complexity then it can be hard to cover the cost of a new motion +// vector in a single block, even if that motion vector reduces distortion. +// However, once established that vector may be usable through the nearest and +// near mv modes to reduce distortion in subsequent blocks and also improve +// visual quality. +static int discount_newmv_test(const VP9_COMP *cpi, + int this_mode, + int_mv this_mv, + int_mv (*mode_mv)[MAX_REF_FRAMES], + int ref_frame) { + return (!cpi->rc.is_src_frame_alt_ref && + (this_mode == NEWMV) && + (this_mv.as_int != 0) && + ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || + (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && + ((mode_mv[NEARMV][ref_frame].as_int == 0) || + (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +} + static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t txfm_cache[], @@ -2464,10 +2488,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, &tmp_mv, &rate_mv); if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; - *rate2 += rate_mv; + frame_mv[refs[0]].as_int = xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int; single_newmv[refs[0]].as_int = tmp_mv.as_int; + + // Estimate the rate implications of a new mv but discount this + // under certain circumstances where we want to help initiate a weak + // motion field, where the distortion gain for a single block may not + // be enough to overcome the cost of a new mv. + if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) { + *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); + } else { + *rate2 += rate_mv; + } } } @@ -2492,11 +2526,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, orig_dst_stride[i] = xd->plane[i].dst.stride; } - /* We don't include the cost of the second reference here, because there - * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other - * words if you present them in that order, the second one is always known - * if the first is known */ - *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); + // We don't include the cost of the second reference here, because there + // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other + // words if you present them in that order, the second one is always known + // if the first is known. + // + // Under some circumstances we discount the cost of new mv mode to encourage + // initiation of a motion field. + if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], + mode_mv, refs[0])) { + *rate2 += MIN(cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]), + cost_mv_ref(cpi, NEARESTMV, mbmi->mode_context[refs[0]])); + } else { + *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); + } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mbmi->mode != NEARESTMV) @@ -2726,6 +2769,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, x->skip_encode = 0; ctx->skip = 0; xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0].src_mi->mbmi.ref_frame[1] = NONE; if (bsize >= BLOCK_8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, @@ -2941,7 +2985,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, mode_skip_mask[INTRA_FRAME] |= ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); - for (i = 0; i < MAX_MODES; ++i) + for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) + mode_threshold[i] = 0; + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; midx = sf->schedule_mode_search ? mode_skip_start : 0; @@ -3065,7 +3111,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (!cm->allow_comp_inter_inter) + if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. @@ -3715,7 +3761,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (!cm->allow_comp_inter_inter) + if (!cpi->allow_comp_inter_inter) continue; if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0775b919c..15831fbbe 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -339,18 +339,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->mv.fullpel_search_step_param = 10; sf->lpf_pick = LPF_PICK_MINIMAL_LPF; } - - if (speed >= 12) { + if (speed >= 8) { sf->adaptive_rd_thresh = 4; sf->mv.subpel_force_stop = 2; } - - if (speed >= 13) { - int i; - sf->max_intra_bsize = BLOCK_32X32; - for (i = 0; i < BLOCK_SIZES; ++i) - sf->inter_mode_mask[i] = INTER_NEAREST; - } } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 184322f4f..31e93be65 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -39,7 +39,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { #if CONFIG_VP9_HIGHBITDEPTH cpi->common.use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, + cpi->common.byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate empty frame for multiple frame " "contexts"); diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index a4051f05e..424cc0843 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -710,8 +710,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, - NULL)) { + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, + NULL, NULL, NULL)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate alt_ref_buffer"); } diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 393eb1a4a..06bcfc317 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -23,23 +23,46 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tokenize.h" -static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; -const TOKENVALUE *vp9_dct_value_tokens_ptr; static int16_t dct_value_cost[DCT_MAX_VALUE * 2]; -const int16_t *vp9_dct_value_cost_ptr; +const int16_t *vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; #if CONFIG_VP9_HIGHBITDEPTH -static TOKENVALUE dct_value_tokens_high10[DCT_MAX_VALUE_HIGH10 * 2]; -const TOKENVALUE *vp9_dct_value_tokens_high10_ptr; static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2]; -const int16_t *vp9_dct_value_cost_high10_ptr; +const int16_t *vp9_dct_value_cost_high10_ptr = + dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10; -static TOKENVALUE dct_value_tokens_high12[DCT_MAX_VALUE_HIGH12 * 2]; -const TOKENVALUE *vp9_dct_value_tokens_high12_ptr; static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2]; -const int16_t *vp9_dct_value_cost_high12_ptr; +const int16_t *vp9_dct_value_cost_high12_ptr = + dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12; #endif +static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { + {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49}, + {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33}, + {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17}, + {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1}, + {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21}, + {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9}, + {8, 7}, {8, 5}, {8, 3}, {8, 1}, + {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1}, + {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1}, + {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0}, + {1, 0}, {2, 0}, {3, 0}, {4, 0}, + {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6}, + {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14}, + {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12}, + {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24}, + {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2}, + {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16}, + {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28}, + {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40}, + {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52}, + {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62} +}; +const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens + + (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) + / 2; + // Array indices are identical to previously-existing CONTEXT_NODE indices const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { -EOB_TOKEN, 2, // 0 = EOB @@ -67,21 +90,31 @@ const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = { -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE }; -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; +static const vp9_tree_index cat1[2] = {0, 0}; +static const vp9_tree_index cat2[4] = {2, 2, 0, 0}; +static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0}; +static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0}; +static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0}; +static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, + 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0}; #if CONFIG_VP9_HIGHBITDEPTH -static vp9_tree_index cat1_high10[2]; -static vp9_tree_index cat2_high10[4]; -static vp9_tree_index cat3_high10[6]; -static vp9_tree_index cat4_high10[8]; -static vp9_tree_index cat5_high10[10]; -static vp9_tree_index cat6_high10[32]; -static vp9_tree_index cat1_high12[2]; -static vp9_tree_index cat2_high12[4]; -static vp9_tree_index cat3_high12[6]; -static vp9_tree_index cat4_high12[8]; -static vp9_tree_index cat5_high12[10]; -static vp9_tree_index cat6_high12[36]; +static const vp9_tree_index cat1_high10[2] = {0, 0}; +static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0}; +static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0}; +static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0}; +static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0}; +static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, + 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28, + 30, 30, 0, 0}; +static const vp9_tree_index cat1_high12[2] = {0, 0}; +static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0}; +static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0}; +static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0}; +static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0}; +static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, + 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28, + 30, 30, 32, 32, 34, 34, 0, 0}; #endif static void init_bit_tree(vp9_tree_index *p, int n) { @@ -95,28 +128,6 @@ static void init_bit_tree(vp9_tree_index *p, int n) { p[0] = p[1] = 0; } -static void init_bit_trees() { - init_bit_tree(cat1, 1); - init_bit_tree(cat2, 2); - init_bit_tree(cat3, 3); - init_bit_tree(cat4, 4); - init_bit_tree(cat5, 5); - init_bit_tree(cat6, 14); -#if CONFIG_VP9_HIGHBITDEPTH - init_bit_tree(cat1_high10, 1); - init_bit_tree(cat2_high10, 2); - init_bit_tree(cat3_high10, 3); - init_bit_tree(cat4_high10, 4); - init_bit_tree(cat5_high10, 5); - init_bit_tree(cat6_high10, 16); - init_bit_tree(cat1_high12, 1); - init_bit_tree(cat2_high12, 2); - init_bit_tree(cat3_high12, 3); - init_bit_tree(cat4_high12, 4); - init_bit_tree(cat5_high12, 5); - init_bit_tree(cat6_high12, 18); -#endif -} const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = { {0, 0, 0, 0}, // ZERO_TOKEN @@ -167,43 +178,24 @@ const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = { struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS]; void vp9_coef_tree_initialize() { - init_bit_trees(); vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); } -static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e, +static void tokenize_init_one(const vp9_extra_bit *const e, int16_t *value_cost, int max_value) { int i = -max_value; - int sign = 1; + TOKENVALUE t; do { - if (!i) - sign = 0; - - { - const int a = sign ? -i : i; - int eb = sign; - - if (a > 4) { - int j = 4; - - while (++j < 11 && e[j].base_val <= a) {} - - t[i].token = --j; - eb |= (a - e[j].base_val) << 1; - } else { - t[i].token = a; - } - t[i].extra = eb; - } + vp9_get_token_extra(i, &t.token, &t.extra); // initialize the cost for extra bits for all possible coefficient value. { int cost = 0; - const vp9_extra_bit *p = &e[t[i].token]; + const vp9_extra_bit *p = &e[t.token]; if (p->base_val) { - const int extra = t[i].extra; + const int extra = t.extra; const int length = p->len; if (length) @@ -217,26 +209,14 @@ static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e, } void vp9_tokenize_initialize() { - vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE; - vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; - - tokenize_init_one(dct_value_tokens + DCT_MAX_VALUE, vp9_extra_bits, + tokenize_init_one(vp9_extra_bits, dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE); #if CONFIG_VP9_HIGHBITDEPTH - vp9_dct_value_tokens_high10_ptr = dct_value_tokens_high10 + - DCT_MAX_VALUE_HIGH10; - vp9_dct_value_cost_high10_ptr = dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10; - - tokenize_init_one(dct_value_tokens_high10 + DCT_MAX_VALUE_HIGH10, - vp9_extra_bits_high10, + tokenize_init_one(vp9_extra_bits_high10, dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10, DCT_MAX_VALUE_HIGH10); - vp9_dct_value_tokens_high12_ptr = dct_value_tokens_high12 + - DCT_MAX_VALUE_HIGH12; - vp9_dct_value_cost_high12_ptr = dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12; - tokenize_init_one(dct_value_tokens_high12 + DCT_MAX_VALUE_HIGH12, - vp9_extra_bits_high12, + tokenize_init_one(vp9_extra_bits_high12, dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12, DCT_MAX_VALUE_HIGH12); #endif @@ -322,8 +302,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, td->counts->eob_branch[tx_size][type][ref]; const uint8_t *const band = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); - const TOKENVALUE *dct_value_tokens; - + int16_t token; + EXTRABIT extra; int aoff, loff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); @@ -333,17 +313,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, scan = so->scan; nb = so->neighbors; c = 0; -#if CONFIG_VP9_HIGHBITDEPTH - if (cpi->common.profile >= PROFILE_2) { - dct_value_tokens = (cpi->common.bit_depth == VPX_BITS_10 ? - vp9_dct_value_tokens_high10_ptr : - vp9_dct_value_tokens_high12_ptr); - } else { - dct_value_tokens = vp9_dct_value_tokens_ptr; - } -#else - dct_value_tokens = vp9_dct_value_tokens_ptr; -#endif while (c < eob) { int v = 0; @@ -362,14 +331,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, v = qcoeff[scan[c]]; } - add_token(&t, coef_probs[band[c]][pt], - dct_value_tokens[v].extra, - (uint8_t)dct_value_tokens[v].token, - (uint8_t)skip_eob, - counts[band[c]][pt]); + vp9_get_token_extra(v, &token, &extra); + + add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token, + (uint8_t)skip_eob, counts[band[c]][pt]); eob_branch[band[c]][pt] += !skip_eob; - token_cache[scan[c]] = vp9_pt_energy_class[dct_value_tokens[v].token]; + token_cache[scan[c]] = vp9_pt_energy_class[token]; ++c; pt = get_coef_context(nb, token_cache, c); } diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 00afb723e..845e139f2 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -24,24 +24,23 @@ void vp9_tokenize_initialize(); #define EOSB_TOKEN 127 // Not signalled, encoder only -typedef struct { - int16_t token; #if CONFIG_VP9_HIGHBITDEPTH - int32_t extra; + typedef int32_t EXTRABIT; #else - int16_t extra; + typedef int16_t EXTRABIT; #endif + + +typedef struct { + int16_t token; + EXTRABIT extra; } TOKENVALUE; typedef struct { const vp9_prob *context_tree; -#if CONFIG_VP9_HIGHBITDEPTH - int32_t extra; -#else - int16_t extra; -#endif - uint8_t token; - uint8_t skip_eob_node; + EXTRABIT extra; + uint8_t token; + uint8_t skip_eob_node; } TOKENEXTRA; extern const vp9_tree_index vp9_coef_tree[]; @@ -63,6 +62,7 @@ extern const int16_t *vp9_dct_value_cost_ptr; * fields are not. */ extern const TOKENVALUE *vp9_dct_value_tokens_ptr; +extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens; #if CONFIG_VP9_HIGHBITDEPTH extern const int16_t *vp9_dct_value_cost_high10_ptr; extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr; @@ -70,6 +70,25 @@ extern const int16_t *vp9_dct_value_cost_high12_ptr; extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr; #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { + *token = CATEGORY6_TOKEN; + if (v >= CAT6_MIN_VAL) + *extra = 2 * v - 2 * CAT6_MIN_VAL; + else + *extra = -2 * v - 2 * CAT6_MIN_VAL + 1; + return; + } + *token = vp9_dct_cat_lt_10_value_tokens[v].token; + *extra = vp9_dct_cat_lt_10_value_tokens[v].extra; +} +static INLINE int16_t vp9_get_token(int v) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) + return 10; + return vp9_dct_cat_lt_10_value_tokens[v].token; +} + + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index e671f3998..ae22a0b32 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -254,7 +254,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, const int16_t* round_ptr, const int16_t* quant_ptr, const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - int zbin_oq_value, uint16_t* eob_ptr, + uint16_t* eob_ptr, const int16_t* scan_ptr, const int16_t* iscan_ptr) { __m128i zero; @@ -287,7 +287,6 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)coeff_ptr; // Pre-condition input (shift by two) diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c index 237c5e278..5c0ad7892 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/vp9/encoder/x86/vp9_dct_ssse3.c @@ -23,7 +23,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - int zbin_oq_value, uint16_t* eob_ptr, + uint16_t* eob_ptr, const int16_t* scan_ptr, const int16_t* iscan_ptr) { __m128i zero; @@ -57,7 +57,6 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; (void)coeff_ptr; // Pre-condition input (shift by two) diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c index 55c6ed71f..0bce9c321 100644 --- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c @@ -24,7 +24,6 @@ void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -32,11 +31,11 @@ void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, __m128i zbins[2]; __m128i nzbins[2]; - zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value), - (int)(zbin_ptr[1] + zbin_oq_value), - (int)(zbin_ptr[1] + zbin_oq_value), - (int)(zbin_ptr[0] + zbin_oq_value)); - zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value)); + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], + (int)zbin_ptr[1], + (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); nzbins[0] = _mm_setzero_si128(); nzbins[1] = _mm_setzero_si128(); @@ -111,7 +110,6 @@ void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -120,14 +118,14 @@ void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, int idx = 0; int idx_arr[1024]; int i, eob = -1; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; - zbins[0] = _mm_set_epi32((zbin1_tmp + zbin_oq_value), - (zbin1_tmp + zbin_oq_value), - (zbin1_tmp + zbin_oq_value), - (zbin0_tmp + zbin_oq_value)); - zbins[1] = _mm_set1_epi32((zbin1_tmp + zbin_oq_value)); + zbins[0] = _mm_set_epi32(zbin1_tmp, + zbin1_tmp, + zbin1_tmp, + zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); nzbins[0] = _mm_setzero_si128(); nzbins[1] = _mm_setzero_si128(); diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index e06eb2f15..679c66e30 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -18,7 +18,7 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, const int16_t* round_ptr, const int16_t* quant_ptr, const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - int zbin_oq_value, uint16_t* eob_ptr, + uint16_t* eob_ptr, const int16_t* scan_ptr, const int16_t* iscan_ptr) { __m128i zero; @@ -39,13 +39,10 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, // Setup global values { - __m128i zbin_oq; __m128i pw_1; - zbin_oq = _mm_set1_epi16(zbin_oq_value); zbin = _mm_load_si128((const __m128i*)zbin_ptr); round = _mm_load_si128((const __m128i*)round_ptr); quant = _mm_load_si128((const __m128i*)quant_ptr); - zbin = _mm_add_epi16(zbin, zbin_oq); pw_1 = _mm_set1_epi16(1); zbin = _mm_sub_epi16(zbin, pw_1); dequant = _mm_load_si128((const __m128i*)dequant_ptr); @@ -229,14 +226,13 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, const int16_t* round_ptr, const int16_t* quant_ptr, const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - int zbin_oq_value, uint16_t* eob_ptr, + uint16_t* eob_ptr, const int16_t* scan_ptr, const int16_t* iscan_ptr) { __m128i zero; (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; - (void)zbin_oq_value; coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index f5f05e799..72e01d646 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -17,7 +17,7 @@ SECTION .text %macro QUANTIZE_FN 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan cmp dword skipm, 0 jne .blank @@ -29,13 +29,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp - movd m4, dword zbin_oqm ; m4 = zbin_oq mova m0, [zbinq] ; m0 = zbin - punpcklwd m4, m4 mova m1, [roundq] ; m1 = round - pshufd m4, m4, 0 mova m2, [quantq] ; m2 = quant - paddw m0, m4 ; m0 = zbin + zbin_oq %ifidn %1, b_32x32 pcmpeqw m5, m5 psrlw m5, 15 @@ -55,7 +51,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psllw m4, 1 %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob lea coeffq, [ coeffq+ncoeffq*2] lea iscanq, [ iscanq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2] @@ -220,7 +216,7 @@ QUANTIZE_FN b_32x32, 7 %macro QUANTIZE_FP 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan cmp dword skipm, 0 jne .blank @@ -248,7 +244,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psllw m2, 1 %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob lea coeffq, [ coeffq+ncoeffq*2] lea iscanq, [ iscanq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2] diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 2504f4db9..f5e6e3190 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -133,13 +133,11 @@ ifeq ($(ARCH_X86_64), yes) VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM) + +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c # neon with assembly and intrinsics implementations. If both are available # prefer assembly. @@ -158,8 +156,10 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM) else ifeq ($(HAVE_NEON), yes) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c @@ -178,6 +178,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b9fb8140c..7b4b17809 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx/vpx_codec.h" +#include "vpx_ports/vpx_once.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" #include "vp9/encoder/vp9_encoder.h" @@ -208,7 +209,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "or kf_max_dist instead."); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -16, 16); + RANGE_CHECK(extra_cfg, cpu_used, -8, 8); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -729,7 +730,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, } priv->extra_cfg = default_extra_cfg; - vp9_initialize_enc(); + once(vp9_initialize_enc); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 809514001..43bf35f9c 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -44,6 +44,7 @@ struct vpx_codec_alg_priv { int flushed; int invert_tile_order; int frame_parallel_decode; // frame-based threading. + int byte_alignment; // External frame buffer info to save for VP9 common. void *ext_priv; // Private data associated with the external frame buffers. @@ -219,6 +220,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { VP9_COMMON *const cm = &ctx->pbi->common; cm->new_fb_idx = -1; + cm->byte_alignment = ctx->byte_alignment; if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { cm->get_fb_cb = ctx->get_ext_fb_cb; @@ -617,6 +619,27 @@ static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 0)) + return VPX_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + VP9_COMMON *const cm = &ctx->pbi->common; + cm->byte_alignment = byte_alignment; + } + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, @@ -629,6 +652,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options}, {VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order}, {VPXD_SET_DECRYPTOR, ctrl_set_decryptor}, + {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment}, // Getters {VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates}, |