diff options
Diffstat (limited to 'vp9')
29 files changed, 782 insertions, 154 deletions
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c index 2f8dda07c..dd569d348 100644 --- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c +++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c @@ -11,6 +11,9 @@ #include <stddef.h> #include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/mem.h" + void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -22,7 +25,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -static inline int32x4_t MULTIPLY_BY_Q0( +static INLINE int32x4_t MULTIPLY_BY_Q0( int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c index c8704aa9c..5c555c458 100644 --- a/vp9/common/arm/neon/vp9_convolve8_neon.c +++ b/vp9/common/arm/neon/vp9_convolve8_neon.c @@ -11,6 +11,9 @@ #include <stddef.h> #include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/mem.h" + void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -22,7 +25,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -static inline int32x4_t MULTIPLY_BY_Q0( +static INLINE int32x4_t MULTIPLY_BY_Q0( int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm deleted file mode 100644 index 60a0d98c5..000000000 --- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp9_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr, -; uint8_t *dst_ptr, int pitch, int stride) -; -; r0 int input_dc -; r1 uint8_t *pred_ptr -; r2 uint8_t *dst_ptr -; r3 int pitch -; sp int stride - -|vp9_dc_only_idct_add_neon| PROC - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; dct_const_round_shift(input_dc * cospi_16_64) - mul r0, r0, r12 ; input_dc * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; ROUND_POWER_OF_TWO(out, 4) - add r0, r0, #8 ; + (1 <<((4) - 1)) - asr r0, r0, #4 ; >> 4 - - vdup.16 q0, r0; ; duplicate a1 - ldr r12, [sp] ; load stride - - vld1.32 {d2[0]}, [r1], r3 - vld1.32 {d2[1]}, [r1], r3 - vld1.32 {d4[0]}, [r1], r3 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c] - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 ; clip_pixel - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r2], r12 - vst1.32 {d2[1]}, [r2], r12 - vst1.32 {d4[0]}, [r2], r12 - vst1.32 {d4[1]}, [r2] - - bx lr - ENDP ; |vp9_dc_only_idct_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c index 68d7cccc0..5fa3f5c01 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vpx_config.h" + static int16_t cospi_2_64 = 16305; static int16_t cospi_4_64 = 16069; static int16_t cospi_6_64 = 15679; @@ -26,7 +28,7 @@ static int16_t cospi_26_64 = 4756; static int16_t cospi_28_64 = 3196; static int16_t cospi_30_64 = 1606; -static inline void TRANSPOSE8X8( +static INLINE void TRANSPOSE8X8( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c index 1bfee22b2..d0e4b4f40 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c @@ -11,7 +11,9 @@ #include <arm_neon.h> #include "vp9/common/vp9_idct.h" -static inline void LD_16x8( +#include "./vpx_config.h" + +static INLINE void LD_16x8( uint8_t *d, int d_stride, uint8x16_t *q8u8, @@ -40,7 +42,7 @@ static inline void LD_16x8( return; } -static inline void ADD_DIFF_16x8( +static INLINE void ADD_DIFF_16x8( uint8x16_t qdiffu8, uint8x16_t *q8u8, uint8x16_t *q9u8, @@ -61,7 +63,7 @@ static inline void ADD_DIFF_16x8( return; } -static inline void SUB_DIFF_16x8( +static INLINE void SUB_DIFF_16x8( uint8x16_t qdiffu8, uint8x16_t *q8u8, uint8x16_t *q9u8, @@ -82,7 +84,7 @@ static inline void SUB_DIFF_16x8( return; } -static inline void ST_16x8( +static INLINE void ST_16x8( uint8_t *d, int d_stride, uint8x16_t *q8u8, diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c index 53f721b44..309bdf8d7 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vpx_config.h" + static int16_t cospi_1_64 = 16364; static int16_t cospi_2_64 = 16305; static int16_t cospi_3_64 = 16207; @@ -57,7 +59,7 @@ static int16_t cospi_31_64 = 804; #define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ q6s16, q7s16, q8s16, q9s16); -static inline void __STORE_COMBINE_CENTER_RESULTS( +static INLINE void __STORE_COMBINE_CENTER_RESULTS( uint8_t *p1, uint8_t *p2, int stride, @@ -105,7 +107,7 @@ static inline void __STORE_COMBINE_CENTER_RESULTS( #define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ q4s16, q5s16, q6s16, q7s16); -static inline void __STORE_COMBINE_EXTREME_RESULTS( +static INLINE void __STORE_COMBINE_EXTREME_RESULTS( uint8_t *p1, uint8_t *p2, int stride, @@ -152,7 +154,7 @@ static inline void __STORE_COMBINE_EXTREME_RESULTS( #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static inline void DO_BUTTERFLY( +static INLINE void DO_BUTTERFLY( int16x8_t q14s16, int16x8_t q13s16, int16_t first_const, @@ -194,7 +196,7 @@ static inline void DO_BUTTERFLY( return; } -static inline void idct32_transpose_pair( +static INLINE void idct32_transpose_pair( int16_t *input, int16_t *t_buf) { int16_t *in; @@ -288,7 +290,7 @@ static inline void idct32_transpose_pair( return; } -static inline void idct32_bands_end_1st_pass( +static INLINE void idct32_bands_end_1st_pass( int16_t *out, int16x8_t q2s16, int16x8_t q3s16, @@ -383,7 +385,7 @@ static inline void idct32_bands_end_1st_pass( return; } -static inline void idct32_bands_end_2nd_pass( +static INLINE void idct32_bands_end_2nd_pass( int16_t *out, uint8_t *dest, int stride, diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c index 50587f6bc..2b3c1ce60 100644 --- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c +++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vpx_config.h" + static int16_t cospi_4_64 = 16069; static int16_t cospi_8_64 = 15137; static int16_t cospi_12_64 = 13623; @@ -18,7 +20,7 @@ static int16_t cospi_20_64 = 9102; static int16_t cospi_24_64 = 6270; static int16_t cospi_28_64 = 3196; -static inline void TRANSPOSE8X8( +static INLINE void TRANSPOSE8X8( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, @@ -87,7 +89,7 @@ static inline void TRANSPOSE8X8( return; } -static inline void IDCT8x8_1D( +static INLINE void IDCT8x8_1D( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index cd8c358fd..1761fada2 100644 --- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -12,6 +12,7 @@ #include <assert.h> #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vp9/common/vp9_common.h" static int16_t sinpi_1_9 = 0x14a3; @@ -22,7 +23,7 @@ static int16_t cospi_8_64 = 0x3b21; static int16_t cospi_16_64 = 0x2d41; static int16_t cospi_24_64 = 0x187e; -static inline void TRANSPOSE4X4( +static INLINE void TRANSPOSE4X4( int16x8_t *q8s16, int16x8_t *q9s16) { int32x4_t q8s32, q9s32; @@ -41,7 +42,7 @@ static inline void TRANSPOSE4X4( return; } -static inline void GENERATE_COSINE_CONSTANTS( +static INLINE void GENERATE_COSINE_CONSTANTS( int16x4_t *d0s16, int16x4_t *d1s16, int16x4_t *d2s16) { @@ -51,7 +52,7 @@ static inline void GENERATE_COSINE_CONSTANTS( return; } -static inline void GENERATE_SINE_CONSTANTS( +static INLINE void GENERATE_SINE_CONSTANTS( int16x4_t *d3s16, int16x4_t *d4s16, int16x4_t *d5s16, @@ -63,7 +64,7 @@ static inline void GENERATE_SINE_CONSTANTS( return; } -static inline void IDCT4x4_1D( +static INLINE void IDCT4x4_1D( int16x4_t *d0s16, int16x4_t *d1s16, int16x4_t *d2s16, @@ -103,7 +104,7 @@ static inline void IDCT4x4_1D( return; } -static inline void IADST4x4_1D( +static INLINE void IADST4x4_1D( int16x4_t *d3s16, int16x4_t *d4s16, int16x4_t *d5s16, diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 03c836d86..04b342c3d 100644 --- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -12,6 +12,7 @@ #include <assert.h> #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vp9/common/vp9_common.h" static int16_t cospi_2_64 = 16305; @@ -30,7 +31,7 @@ static int16_t cospi_26_64 = 4756; static int16_t cospi_28_64 = 3196; static int16_t cospi_30_64 = 1606; -static inline void TRANSPOSE8X8( +static INLINE void TRANSPOSE8X8( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, @@ -99,7 +100,7 @@ static inline void TRANSPOSE8X8( return; } -static inline void IDCT8x8_1D( +static INLINE void IDCT8x8_1D( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, @@ -255,7 +256,7 @@ static inline void IDCT8x8_1D( return; } -static inline void IADST8X8_1D( +static INLINE void IADST8X8_1D( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index 97fe02805..09f470e97 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,9 +8,178 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <arm_neon.h> + #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE void vp9_loop_filter_neon_16( + uint8x16_t qblimit, // blimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vdupq_n_u16(3); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q4 = vdupq_n_u8(3); + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); + *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); + return; +} + +#if !HAVE_NEON_ASM +void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; + uint8x16_t qblimit, qlimit, qthresh; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + + dblimit0 = vld1_u8(blimit0); + dlimit0 = vld1_u8(limit0); + dthresh0 = vld1_u8(thresh0); + dblimit1 = vld1_u8(blimit1); + dlimit1 = vld1_u8(limit1); + dthresh1 = vld1_u8(thresh1); + qblimit = vcombine_u8(dblimit0, dblimit1); + qlimit = vcombine_u8(dlimit0, dlimit1); + qthresh = vcombine_u8(dthresh0, dthresh1); + + s -= (p << 2); + + q3u8 = vld1q_u8(s); + s += p; + q4u8 = vld1q_u8(s); + s += p; + q5u8 = vld1q_u8(s); + s += p; + q6u8 = vld1q_u8(s); + s += p; + q7u8 = vld1q_u8(s); + s += p; + q8u8 = vld1q_u8(s); + s += p; + q9u8 = vld1q_u8(s); + s += p; + q10u8 = vld1q_u8(s); + + vp9_loop_filter_neon_16(qblimit, qlimit, qthresh, + q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, + &q5u8, &q6u8, &q7u8, &q8u8); + + s -= (p * 5); + vst1q_u8(s, q5u8); + s += p; + vst1q_u8(s, q6u8); + s += p; + vst1q_u8(s, q7u8); + s += p; + vst1q_u8(s, q8u8); + return; +} +#endif // !HAVE_NEON_ASM + void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c index f54d7a94b..079d26677 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c @@ -10,7 +10,9 @@ #include <arm_neon.h> -static inline void vp9_loop_filter_neon( +#include "./vpx_config.h" + +static INLINE void vp9_loop_filter_neon( uint8x8_t dblimit, // flimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh @@ -271,7 +273,7 @@ void vp9_lpf_vertical_4_neon( return; } -static inline void vp9_mbloop_filter_neon( +static INLINE void vp9_mbloop_filter_neon( uint8x8_t dblimit, // mblimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c new file mode 100644 index 000000000..d0beaa720 --- /dev/null +++ b/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stddef.h> +#include <arm_neon.h> + +void vp9_v_predictor_4x4_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += y_stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); + return; +} + +void vp9_v_predictor_8x8_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += y_stride) + vst1_u8(dst, d0u8); + return; +} + +void vp9_v_predictor_16x16_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += y_stride) + vst1q_u8(dst, q0u8); + return; +} + +void vp9_v_predictor_32x32_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += y_stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } + return; +} + +void vp9_h_predictor_4x4_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + return; +} + +void vp9_h_predictor_8x8_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); + return; +} + +void vp9_h_predictor_16x16_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += y_stride; + } + return; +} + +void vp9_h_predictor_32x32_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + } + } + return; +} + +void vp9_tm_predictor_4x4_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); + + d0u8 = vdup_n_u8(above[-1]); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += y_stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), + vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } + return; +} + +void vp9_tm_predictor_8x8_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; + + d0u8 = vdup_n_u8(above[-1]); + d30u8 = vld1_u8(left); + d2u8 = vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + } + return; +} + +void vp9_tm_predictor_16x16_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + + q0u8 = vdupq_n_u8(above[-1]); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += y_stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += y_stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += y_stride; + } + } + return; +} + +void vp9_tm_predictor_32x32_neon( + uint8_t *dst, + ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + + q0u8 = vdupq_n_u8(above[-1]); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d6u16, 2); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + + q0u16 = vdupq_lane_u16(d6u16, 3); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm index dc9856fa8..dc9856fa8 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index cad57501a..2f75af575 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -112,7 +112,8 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) goto fail; if (cm->frame_bufs[i].mvs == NULL) { cm->frame_bufs[i].mvs = @@ -133,7 +134,8 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) goto fail; #endif diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index d7610ed28..4557e19bf 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -453,6 +453,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_default_coef_probs(cm); vp9_init_mode_probs(cm->fc); vp9_init_mv_probs(cm); + cm->fc->initialized = 1; if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->reset_frame_context == 3) { @@ -469,8 +470,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); - vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); - vp9_zero(cm->ref_frame_sign_bias); cm->frame_context_idx = 0; diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 6831d3f87..6db10806d 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -50,6 +50,7 @@ typedef struct frame_contexts { struct tx_probs tx_probs; vp9_prob skip_probs[SKIP_CONTEXTS]; nmv_context nmvc; + int initialized; } FRAME_CONTEXT; typedef struct { diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index c166590e2..bba24e03b 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -208,6 +208,7 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; + int byte_alignment; // Private data associated with the frame buffer callbacks. void *cb_priv; diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 06cb65a98..7eac70be2 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -671,7 +671,8 @@ int vp9_post_proc_frame(struct VP9Common *cm, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif // CONFIG_VP9_HIGHBITDEPTH - VP9_ENC_BORDER_IN_PIXELS) < 0) { + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate MFQE framebuffer"); } @@ -688,12 +689,13 @@ int vp9_post_proc_frame(struct VP9Common *cm, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) + VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL) < 0) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate post-processing buffer"); if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 && - cm->postproc_state.last_frame_valid && + cm->postproc_state.last_frame_valid && cm->bit_depth == 8 && cm->postproc_state.last_base_qindex <= last_q_thresh && cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) { vp9_mfqe(cm); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 11c0d81a9..df3db505f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -66,8 +66,7 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc"; -$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon; +specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_4x4/; @@ -79,12 +78,10 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc"; -$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon; +specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc"; -$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon; +specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc"; add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; @@ -108,8 +105,7 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc"; -$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon; +specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_8x8/; @@ -121,12 +117,10 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc"; -$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon; +specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc"; -$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon; +specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; @@ -150,8 +144,7 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc"; -$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon; +specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_16x16/; @@ -163,12 +156,10 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc"; -$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon; +specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc"; -$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon; +specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; @@ -192,8 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc"; -$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon; +specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_32x32/; @@ -205,12 +195,10 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_32x32/; add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc"; -$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon; +specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64"; -$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon; +specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64"; add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; @@ -261,8 +249,7 @@ add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t * specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/; -$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon; +specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/; # # post proc diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 58df87d0c..9677173db 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -719,6 +719,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->use_highbitdepth, #endif VP9_DEC_BORDER_IN_PIXELS, + cm->byte_alignment, &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, cm->cb_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -793,6 +794,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, cm->use_highbitdepth, #endif VP9_DEC_BORDER_IN_PIXELS, + cm->byte_alignment, &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, cm->cb_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -1556,6 +1558,10 @@ void vp9_decode_frame(VP9Decoder *pbi, vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); *cm->fc = cm->frame_contexts[cm->frame_context_idx]; + if (!cm->fc->initialized) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + vp9_zero(cm->counts); xd->corrupted = 0; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 4deeed217..56ec6b335 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -425,6 +425,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #endif int border) { int i, fail; + const int legacy_byte_alignment = 0; assert(denoiser != NULL); for (i = 0; i < MAX_REF_FRAMES; ++i) { @@ -433,7 +434,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif - border); + border, legacy_byte_alignment); if (fail) { vp9_denoiser_free(denoiser); return 1; @@ -448,7 +449,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif - border); + border, legacy_byte_alignment); if (fail) { vp9_denoiser_free(denoiser); return 1; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index aaa6b238d..a7f34a2e4 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -491,7 +491,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -511,7 +512,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); @@ -521,7 +523,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); @@ -531,7 +534,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); } @@ -567,7 +571,8 @@ static void update_frame_size(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate alt_ref_buffer"); } @@ -2473,7 +2478,8 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, cm->use_highbitdepth, - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf, (int)cm->bit_depth); #else @@ -2482,7 +2488,8 @@ void vp9_scale_references(VP9_COMP *cpi) { vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); #endif // CONFIG_VP9_HIGHBITDEPTH cpi->scaled_ref_idx[ref_frame - 1] = new_fb; @@ -2721,7 +2728,8 @@ void set_frame_size(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL); alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index 823e7a162..708072ee2 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -65,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, // Allocate the lookahead structures ctx = calloc(1, sizeof(*ctx)); if (ctx) { + const int legacy_byte_alignment = 0; unsigned int i; ctx->max_sz = depth; ctx->buf = calloc(depth, sizeof(*ctx->buf)); @@ -76,7 +77,8 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, + legacy_byte_alignment)) goto bail; } return ctx; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index d5ab0cc6c..e0892fe35 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -522,8 +522,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, struct macroblockd_plane *const pd = &xd->plane[0]; PREDICTION_MODE best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cm->tx_mode]); + TX_SIZE best_tx_size = TX_SIZES; INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; @@ -537,9 +536,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // Reduce the intra cost penalty for small blocks (<=16x16). const int reduction_fac = (cpi->sf.partition_search_type == VAR_BASED_PARTITION && - bsize <= BLOCK_16X16) ? 4 : 1; + bsize <= BLOCK_16X16) ? 2 : 0; const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac; + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac; const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int8_t segment_id = mbmi->segment_id; @@ -870,11 +869,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, pd->dst = orig_dst; for (i = 0; i < 4; ++i) { - const TX_SIZE saved_tx_size = mbmi->tx_size; const PREDICTION_MODE this_mode = intra_mode_list[i]; if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size])) continue; - skip_txfm = x->skip_txfm[0]; args.mode = this_mode; args.rate = 0; args.dist = 0; @@ -895,11 +892,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = INTRA_FRAME; mbmi->uv_mode = this_mode; mbmi->mv[0].as_int = INVALID_MV; - } else { - x->skip_txfm[0] = best_mode_skip_txfm; - mbmi->tx_size = saved_tx_size; } } + + // Reset mb_mode_info to the best inter mode. + if (mbmi->ref_frame[0] != INTRA_FRAME) { + x->skip_txfm[0] = best_mode_skip_txfm; + mbmi->tx_size = best_tx_size; + } } pd->dst = orig_dst; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index bc5edc815..e5a98d1bd 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2726,6 +2726,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, x->skip_encode = 0; ctx->skip = 0; xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0].src_mi->mbmi.ref_frame[1] = NONE; if (bsize >= BLOCK_8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 184322f4f..31e93be65 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -39,7 +39,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { #if CONFIG_VP9_HIGHBITDEPTH cpi->common.use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, + cpi->common.byte_alignment, + NULL, NULL, NULL)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate empty frame for multiple frame " "contexts"); diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index a4051f05e..424cc0843 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -710,8 +710,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, - NULL)) { + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, + NULL, NULL, NULL)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate alt_ref_buffer"); } diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 256045983..f5e6e3190 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -133,11 +133,8 @@ ifeq ($(ARCH_X86_64), yes) VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c @@ -159,8 +156,10 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM) else ifeq ($(HAVE_NEON), yes) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c @@ -179,6 +178,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 809514001..43bf35f9c 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -44,6 +44,7 @@ struct vpx_codec_alg_priv { int flushed; int invert_tile_order; int frame_parallel_decode; // frame-based threading. + int byte_alignment; // External frame buffer info to save for VP9 common. void *ext_priv; // Private data associated with the external frame buffers. @@ -219,6 +220,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { VP9_COMMON *const cm = &ctx->pbi->common; cm->new_fb_idx = -1; + cm->byte_alignment = ctx->byte_alignment; if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { cm->get_fb_cb = ctx->get_ext_fb_cb; @@ -617,6 +619,27 @@ static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 0)) + return VPX_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + VP9_COMMON *const cm = &ctx->pbi->common; + cm->byte_alignment = byte_alignment; + } + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, @@ -629,6 +652,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options}, {VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order}, {VPXD_SET_DECRYPTOR, ctrl_set_decryptor}, + {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment}, // Getters {VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates}, |