diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm | 237 | ||||
-rw-r--r-- | vp9/common/vp9_convolve.c | 5 | ||||
-rw-r--r-- | vp9/common/vp9_entropymv.c | 82 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter.c | 10 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 4 | ||||
-rw-r--r-- | vp9/common/vp9_postproc.c | 3 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 4 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodframe.c | 30 | ||||
-rw-r--r-- | vp9/decoder/vp9_onyxd_if.c | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 72 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 10 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 4 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 7 | ||||
-rw-r--r-- | vp9/vp9_cx_iface.c | 2 | ||||
-rw-r--r-- | vp9/vp9_dx_iface.c | 4 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 2 |
16 files changed, 409 insertions, 75 deletions
diff --git a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm new file mode 100644 index 000000000..963ef35da --- /dev/null +++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_iht4x4_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are + ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain + ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back + ; into d16-d19 registers. This macro will touch q10- q15 registers and use + ; them as buffer during calculation. 
+ MACRO + IDCT4x4_1D + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64 + vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64 + vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64 + vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q10, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + MEND + + ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which are + ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9. + ; d5 must contain sinpi_4_9. d6 and r0 must contain sinpi_3_9. The output + ; will be stored back into d16-d19 registers. This macro will touch q10-q15 + ; registers and use them as buffer during calculation.
+ MACRO + IADST4x4_1D + vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0 + vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0 + vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1 + vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2 + vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2 + vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit + vaddw.s16 q15, q15, d19 ; x0 + x3 + vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3 + vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2 + vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3 + + vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5 + vadd.s32 q10, q10, q8 + vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6 + vdup.32 q8, r0 ; duplicate sinpi_3_9 + vsub.s32 q11, q11, q9 + vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7 + + vadd.s32 q13, q10, q12 ; s0 = x0 + x3 + vadd.s32 q10, q10, q11 ; x0 + x1 + vadd.s32 q14, q11, q12 ; s1 = x1 + x3 + vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3 + + ; dct_const_round_shift + vqrshrn.s32 d16, q13, #14 + vqrshrn.s32 d17, q14, #14 + vqrshrn.s32 d18, q15, #14 + vqrshrn.s32 d19, q10, #14 + MEND + + ; Generate cosine constants in d0 - d2 for the IDCT + MACRO + GENERATE_COSINE_CONSTANTS + ; cospi_8_64 = 15137 = 0x3b21 + mov r0, #0x3b00 + add r0, #0x21 + ; cospi_16_64 = 11585 = 0x2d41 + mov r3, #0x2d00 + add r3, #0x41 + ; cospi_24_64 = 6270 = 0x187e + mov r12, #0x1800 + add r12, #0x7e + + ; generate constant vectors + vdup.16 d0, r0 ; duplicate cospi_8_64 + vdup.16 d1, r3 ; duplicate cospi_16_64 + vdup.16 d2, r12 ; duplicate cospi_24_64 + MEND + + ; Generate sine constants in d3 - d7 for the IADST.
+ MACRO + GENERATE_SINE_CONSTANTS + ; sinpi_1_9 = 5283 = 0x14A3 + mov r0, #0x1400 + add r0, #0xa3 + ; sinpi_2_9 = 9929 = 0x26C9 + mov r3, #0x2600 + add r3, #0xc9 + ; sinpi_4_9 = 15212 = 0x3B6C + mov r12, #0x3b00 + add r12, #0x6c + + ; generate constant vectors + vdup.16 d3, r0 ; duplicate sinpi_1_9 + + ; sinpi_3_9 = 13377 = 0x3441 + mov r0, #0x3400 + add r0, #0x41 + + vdup.16 d4, r3 ; duplicate sinpi_2_9 + vdup.16 d5, r12 ; duplicate sinpi_4_9 + vdup.16 q3, r0 ; duplicate sinpi_3_9 + MEND + + ; Transpose a 4x4 16bits data matrix. The data is loaded in d16-d19. + MACRO + TRANSPOSE4X4 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + vtrn.32 q8, q9 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride, int tx_type) +; +; r0 int16_t *input +; r1 uint8_t *dest +; r2 int dest_stride +; r3 int tx_type +; This function will only handle tx_type of 1,2,3. +|vp9_short_iht4x4_add_neon| PROC + + ; load the inputs into d16-d19 + vld1.s16 {q8,q9}, [r0]!
+ + ; transpose the input data + TRANSPOSE4X4 + + ; decide the type of transform + cmp r3, #2 + beq idct_iadst + cmp r3, #3 + beq iadst_iadst + +iadst_idct + ; generate constants + GENERATE_COSINE_CONSTANTS + GENERATE_SINE_CONSTANTS + + ; first transform rows + IDCT4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IADST4x4_1D + + b end_vp9_short_iht4x4_add_neon + +idct_iadst + ; generate constants + GENERATE_COSINE_CONSTANTS + GENERATE_SINE_CONSTANTS + + ; first transform rows + IADST4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IDCT4x4_1D + + b end_vp9_short_iht4x4_add_neon + +iadst_iadst + ; generate constants + GENERATE_SINE_CONSTANTS + + ; first transform rows + IADST4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IADST4x4_1D + +end_vp9_short_iht4x4_add_neon + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1], r2 + vld1.32 {d27[1]}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vp9_short_iht4x4_add_neon| + + END diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index 1e6cd4404..be092f41c 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -195,7 +195,7 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, * h == 64, taps == 8. 
*/ uint8_t temp[64 * 135]; - int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps; assert(w <= 64); assert(h <= 64); @@ -203,9 +203,6 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); - if (intermediate_height < h) - intermediate_height = h; - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, taps); diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 6cfc34697..c6eefda92 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -79,20 +79,59 @@ static const nmv_context default_nmv_context = { #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) +static const uint8_t log_in_base_2[] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 +}; + MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { MV_CLASS_TYPE c = MV_CLASS_0; - if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; - else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; - else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; - else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3; - else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4; - else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; - else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; - else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; - else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; - else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; - else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10; - else assert(0); + if (z >= CLASS0_SIZE * 4096) + c = MV_CLASS_10; + else + c = log_in_base_2[z >> 3]; + if (offset) *offset = z - mv_class_base(c); return c; @@ -123,29 +162,18 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts, d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { comp_counts->class0[d] += incr; + comp_counts->class0_fp[d][f] += incr; + comp_counts->class0_hp[e] += usehp * incr; } else { int i; int b = c + CLASS0_BITS - 1; // number of bits for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr; - } - - /* Code the fractional pel bits */ - if (c == MV_CLASS_0) { - comp_counts->class0_fp[d][f] += incr; - } else { comp_counts->fp[f] += incr; - } - - /* Code the high precision bit */ - if (usehp) { - if (c == MV_CLASS_0) { - comp_counts->class0_hp[e] += incr; - } else { - comp_counts->hp[e] += incr; - } + comp_counts->hp[e] += usehp * incr; } } diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index cfa61c20b..df806ac56 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -777,6 +777,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, } } } +#if CONFIG_NON420 static void 
filter_block_plane_non420(VP9_COMMON *cm, struct macroblockd_plane *plane, const MODE_INFO *mi, @@ -896,6 +897,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, dst->buf += 8 * dst->stride; } } +#endif static void filter_block_plane(VP9_COMMON *const cm, struct macroblockd_plane *const plane, @@ -981,8 +983,10 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; LOOP_FILTER_MASK lfm; +#if CONFIG_NON420 int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && xd->plane[1].subsampling_x == 1); +#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride; @@ -993,16 +997,22 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, setup_dst_planes(xd, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. +#if CONFIG_NON420 if (use_420) +#endif setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { +#if CONFIG_NON420 if (use_420) +#endif filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col, &lfm); +#if CONFIG_NON420 else filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col); +#endif } } } diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index a669cc5e7..f0bc063f2 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -20,7 +20,7 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_quant_common.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif @@ -201,7 +201,7 @@ typedef struct VP9Common { unsigned int current_video_frame; int version; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC struct postproc_state postproc_state; #endif diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 48d3d2d98..955e6766a 100644 --- a/vp9/common/vp9_postproc.c +++ 
b/vp9/common/vp9_postproc.c @@ -1011,7 +1011,8 @@ int vp9_post_proc_frame(struct VP9Common *cm, /* handle problem with extending borders */ dest->y_width = cm->width; dest->y_height = cm->height; - dest->uv_height = dest->y_height / 2; + dest->uv_width = dest->y_width >> cm->subsampling_x; + dest->uv_height = dest->y_height >> cm->subsampling_y; return 0; } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 104db6aeb..c2777aa51 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -237,7 +237,7 @@ specialize vp9_loop_filter_horizontal_edge mmx neon # # post proc # -if [ "$CONFIG_POSTPROC" = "yes" ]; then +if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit" specialize vp9_mbpost_proc_down mmx sse2 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm @@ -325,7 +325,7 @@ prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add sse2 +specialize vp9_short_iht4x4_add sse2 neon prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_short_iht8x8_add sse2 diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 41e406d95..6cb7c094b 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -77,14 +77,11 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); } -static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) { +static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { int i; - const int segment_id = xd->mode_info_context->mbmi.segment_id; - xd->q_index = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - - xd->plane[0].dequant = cm->y_dequant[xd->q_index]; + 
xd->plane[0].dequant = cm->y_dequant[q_index]; for (i = 1; i < MAX_MB_PLANE; i++) - xd->plane[i].dequant = cm->uv_dequant[xd->q_index]; + xd->plane[i].dequant = cm->uv_dequant[q_index]; } static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, @@ -149,14 +146,17 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, } static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - if (xd->mode_info_context->mbmi.skip_coeff) { - reset_skip_context(xd, bsize); + if (mbmi->skip_coeff) { + reset_skip_context(xd, bsize); return -1; } else { - if (pbi->common.seg.enabled) - init_dequantizer(&pbi->common, xd); + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); // TODO(dkovalev) if (!vp9_reader_has_error(r)) return vp9_decode_tokens(pbi, r, bsize); @@ -173,6 +173,7 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, xd->mode_info_context = cm->mi + offset; xd->mode_info_context->mbmi.sb_type = bsize; + xd->mode_info_stride = cm->mode_info_stride; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + offset : NULL; @@ -453,8 +454,7 @@ static void setup_loopfilter(struct loopfilter *lf, static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { const int old = *delta_q; - if (vp9_rb_read_bit(rb)) - *delta_q = vp9_rb_read_signed_literal(rb, 4); + *delta_q = vp9_rb_read_bit(rb) ? 
vp9_rb_read_signed_literal(rb, 4) : 0; return old != *delta_q; } @@ -958,11 +958,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - xd->mode_info_context = cm->mi; - xd->prev_mode_info_context = cm->prev_mi; - xd->mode_info_stride = cm->mode_info_stride; - - init_dequantizer(cm, &pbi->mb); + setup_plane_dequants(cm, &pbi->mb, cm->base_qindex); cm->fc = cm->frame_contexts[cm->frame_context_idx]; diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index b609f9f31..505e9dc5d 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -13,7 +13,7 @@ #include <stdio.h> #include "vp9/common/vp9_onyxc_int.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/decoder/vp9_onyxd.h" @@ -421,7 +421,7 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC ret = vp9_post_proc_frame(&pbi->common, sd, flags); #else @@ -429,7 +429,9 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, *sd = *pbi->common.frame_to_show; sd->y_width = pbi->common.width; sd->y_height = pbi->common.height; - sd->uv_height = pbi->common.height / 2; + sd->uv_width = sd->y_width >> pbi->common.subsampling_x; + sd->uv_height = sd->y_height >> pbi->common.subsampling_y; + ret = 0; } else { ret = -1; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index eb83903ca..45758e7cb 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -851,13 +851,75 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } } -static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) { +// Check to see if the given partition size is allowed for a specified number +// of 8x8 block rows and columns remaining in 
the image. +// If not then return the largest allowed partition size +static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, + int rows_left, int cols_left, + int *bh, int *bw) { + if ((rows_left <= 0) || (cols_left <= 0)) { + return MIN(bsize, BLOCK_8X8); + } else { + for (; bsize > 0; --bsize) { + *bh = num_8x8_blocks_high_lookup[bsize]; + *bw = num_8x8_blocks_wide_lookup[bsize]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return bsize; +} + +// This function attempts to set all mode info entries in a given SB64 +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. +static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; + int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row; + int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col; int block_row, block_col; - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - m[block_row * mis + block_col].mbmi.sb_type = bsize; + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // Apply the requested partition size to the SB64 if it is all "in image" + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + for (block_row = 0; block_row < MI_BLOCK_SIZE; ++block_row) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = bsize; + } + } + } else { + // Else this is a partial SB64. 
+ int bh = num_8x8_blocks_high_lookup[bsize]; + int bw = num_8x8_blocks_wide_lookup[bsize]; + int sub_block_row; + int sub_block_col; + int row_index; + int col_index; + + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + // Find a partition size that fits + bsize = find_partition_size(cpi->sf.always_this_block_size, + (row8x8_remaining - block_row), + (col8x8_remaining - block_col), &bh, &bw); + + // Set the mi entries for all 8x8 blocks within the selected size + for (sub_block_row = 0; sub_block_row < bh; ++sub_block_row) { + for (sub_block_col = 0; sub_block_col < bw; ++sub_block_col) { + row_index = block_row + sub_block_row; + col_index = block_col + sub_block_col; + m[row_index * mis + col_index].mbmi.sb_type = bsize; + } + } + } } } } @@ -1946,7 +2008,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, cpi->mb.source_variance = UINT_MAX; if (cpi->sf.use_one_partition_size_always) { set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, m, cpi->sf.always_this_block_size); + set_partitioning(cpi, m, mi_row, mi_col); rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } else if (cpi->sf.partition_by_variance) { diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index e9c214ff8..d35b739fb 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -28,7 +28,7 @@ #include "vp9/encoder/vp9_segmentation.h" #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vpx_mem/vpx_mem.h" @@ -2895,7 +2895,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, set_mvcost(&cpi->mb); } -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; @@ -3954,7 +3954,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, { double frame_psnr2, 
frame_ssim2 = 0; double weight = 0; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->lf.filter_level * 10 / 6); #endif @@ -4030,7 +4030,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, return -1; else { int ret; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC ret = vp9_post_proc_frame(&cpi->common, dest, flags); #else @@ -4044,7 +4044,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, ret = -1; } -#endif // !CONFIG_POSTPROC +#endif // !CONFIG_VP9_POSTPROC vp9_clear_system_state(); return ret; } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 39b6544f6..647265bf6 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2916,8 +2916,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (sse_v - var_v < thresh_dc || sse_v == var_v)) { x->skip = 1; - *rate2 = 500; - *rate_uv = 0; + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); // Scaling factor for SSE from spatial domain to frequency domain // is 16. Adjust distortion accordingly. 
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index fb302abec..c6daecca0 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -73,11 +73,11 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c -VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h -VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -ifeq ($(CONFIG_POSTPROC),yes) +ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif @@ -102,6 +102,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(AS VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 0874afdbc..e9549228e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -888,7 +888,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { -#if CONFIG_POSTPROC +#if 
CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); (void)ctr_id; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index e7362fca0..10b32385c 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -20,7 +20,7 @@ #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/vp9_iface_common.h" -#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) +#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) typedef vpx_codec_stream_info_t vp9_stream_info_t; /* Structures for handling memory allocations */ @@ -596,7 +596,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); if (data) { diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 89de6014e..9fbf100f5 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -64,7 +64,7 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c VP9_CX_SRCS-yes += encoder/vp9_variance_c.c -ifeq ($(CONFIG_POSTPROC),yes) +ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif |