diff options
Diffstat (limited to 'vp9/common')
32 files changed, 841 insertions, 244 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm index 279f678b1..4a49964d5 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -349,7 +349,7 @@ loop_h vdup.u8 d0, r12 ; preload 8 left - vld1.8 d30, [r3] + vld1.8 {d30}, [r3] ; Load above 8 pixels vld1.64 {d2}, [r2] @@ -422,10 +422,10 @@ loop_h vdup.u8 q0, r12 ; Load above 8 pixels - vld1.8 q1, [r2] + vld1.8 {q1}, [r2] ; preload 8 left into r12 - vld1.8 d18, [r3]! + vld1.8 {d18}, [r3]! ; Compute above - ytop_left vsubl.u8 q2, d2, d0 @@ -492,7 +492,7 @@ loop_16x16_neon vqshrun.s16 d23, q8, #0 vdup.16 q0, d20[2] vdup.16 q8, d20[3] - vld1.8 d18, [r3]! ; preload 8 left into r12 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 vmovl.u8 q10, d18 vst1.64 {d2,d3}, [r0], r1 vst1.64 {d22,d23}, [r0], r1 @@ -518,11 +518,11 @@ loop_16x16_neon vdup.u8 q0, r12 ; Load above 32 pixels - vld1.8 q1, [r2]! - vld1.8 q2, [r2] + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] ; preload 8 left pixels - vld1.8 d26, [r3]! + vld1.8 {d26}, [r3]! ; Compute above - ytop_left vsubl.u8 q8, d2, d0 @@ -621,7 +621,7 @@ loop_32x32_neon vst1.64 {d0-d3}, [r0], r1 vqshrun.s16 d24, q12, #0 vqshrun.s16 d25, q13, #0 - vld1.8 d0, [r3]! ; preload 8 left pixels + vld1.8 {d0}, [r3]! ; preload 8 left pixels vqshrun.s16 d26, q14, #0 vqshrun.s16 d27, q15, #0 vmovl.u8 q3, d0 diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index 991d3c2b3..6ebea9f2f 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -85,8 +85,8 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index 1b2f5506a..19c582fd1 100644 --- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_10, step1_11, step1_12, step1_13; @@ -404,8 +404,8 @@ static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -905,13 +905,13 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct16_1d_rows_dspr2(input, out, 16); + idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -1099,16 +1099,16 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct16_1d_rows_dspr2(input, outptr, 16); - idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + idct16_rows_dspr2(input, outptr, 16); + idct16_cols_add_blk_dspr2(out, dest, pitch); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct16_1d_rows_dspr2(input, outptr, 16); + idct16_rows_dspr2(input, outptr, 16); outptr = out; for (i = 0; i < 16; ++i) { - iadst16_1d(outptr, temp_out); + iadst16(outptr, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = @@ -1125,7 +1125,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1134,7 +1134,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; - idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + idct16_cols_add_blk_dspr2(temp_in, dest, pitch); } break; case ADST_ADST: // ADST in both directions @@ -1145,7 +1145,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1153,7 +1153,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - iadst16_1d(temp_in, temp_out); + iadst16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) @@ -1183,7 +1183,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_1d_rows_dspr2(input, outptr, 4); + idct16_rows_dspr2(input, outptr, 4); outptr += 4; for (i = 0; i < 6; ++i) { @@ -1213,7 +1213,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c index 5e92db3d2..132d88ce5 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -18,8 +18,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index bc6759400..74a90b02c 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -882,10 +882,10 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 32); + idct32_rows_dspr2(input, outptr, 32); // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -903,7 +903,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 8); + idct32_rows_dspr2(input, outptr, 8); outptr += 8; __asm__ __volatile__ ( @@ -947,7 +947,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, } // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, stride); } void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 5b7aa5e71..1990348b8 100644 --- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -19,7 +19,7 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; @@ -104,7 +104,7 @@ static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { } } -static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, +static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; @@ -240,10 +240,10 @@ void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); // Columns - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, @@ -319,7 +319,7 @@ void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, } } -static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst4_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3; @@ -379,16 +379,16 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - vp9_idct4_1d_rows_dspr2(input, outptr); - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_rows_dspr2(input, outptr); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); outptr = out; for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(outptr, temp_out); + iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = @@ -400,7 +400,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -410,11 +410,11 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -422,7 +422,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - iadst4_1d_dspr2(temp_in, temp_out); + iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 93a08401d..acccaea6d 100644 --- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct8_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; const int const_2_power_13 = 8192; int Temp0, Temp1, Temp2, Temp3, Temp4; @@ -200,8 +200,8 @@ static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; @@ -462,13 +462,13 @@ void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } -static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst8_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3, x4, x5, x6, x7; @@ -563,14 +563,14 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct8_1d_rows_dspr2(input, outptr, 8); - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_rows_dspr2(input, outptr, 8); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(&out[i * 8], temp_out); + iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -580,7 +580,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -590,11 +590,11 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -603,7 +603,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - iadst8_1d_dspr2(temp_in, temp_out); + iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -631,7 +631,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 4); + idct8_rows_dspr2(input, outptr, 4); outptr += 4; @@ -659,7 +659,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e033fbb99..6f771992b 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -33,9 +33,16 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { void vp9_free_frame_buffers(VP9_COMMON *cm) { int i; - for (i = 0; i < FRAME_BUFFERS; i++) + for (i = 0; i < FRAME_BUFFERS; i++) { vp9_free_frame_buffer(&cm->frame_bufs[i].buf); + if (cm->frame_bufs[i].ref_count > 0 && + cm->frame_bufs[i].raw_frame_buffer.data != NULL) { + cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer); + cm->frame_bufs[i].ref_count = 0; + } + } + vp9_free_frame_buffer(&cm->post_proc_buffer); vpx_free(cm->mip); @@ -85,7 +92,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { int mi_size; if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9_DEC_BORDER_IN_PIXELS) < 0) + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); @@ -199,6 +206,7 @@ void vp9_create_common(VP9_COMMON *cm) { void vp9_remove_common(VP9_COMMON *cm) { vp9_free_frame_buffers(cm); + vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } void vp9_initialize_common() { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 70b8ffa4e..f10a3c8c7 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -182,7 +182,7 @@ struct macroblockd_plane { int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - int16_t *dequant; + const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; }; diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index 3807ccc87..d30e0b488 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -145,7 +145,7 @@ static const InterpKernel *get_filter_base(const int16_t *filter) { } static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (const InterpKernel *)(intptr_t)f - base; + return (int)((const InterpKernel *)(intptr_t)f - base); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index e030d92ec..d6b380fd5 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -119,7 +119,7 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]); extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]); -static const uint8_t *get_band_translate(TX_SIZE tx_size) { +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 : vp9_coefband_trans_8x8plus; } @@ -146,8 +146,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l) { +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -174,8 +174,8 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi_8x8[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 6def3c869..25cba7fbe 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -345,7 +345,7 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, probs); } diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 60ae79fdc..e1f5ef7b4 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -192,8 +192,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, - probs); + vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, + MV_MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c new file mode 100644 index 000000000..d903ed695 --- /dev/null +++ b/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL || fb == NULL) + return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *int_fb; + (void)cb_priv; + if (fb == NULL) + return -1; + + int_fb = (InternalFrameBuffer *)fb->priv; + int_fb->in_use = 0; + return 0; +} diff --git a/vp9/common/vp9_frame_buffers.h b/vp9/common/vp9_frame_buffers.h new file mode 100644 index 000000000..e2cfe61b6 --- /dev/null +++ b/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h index 98fd1d82f..3eb7f9d61 100644 --- a/vp9/common/vp9_mv.h +++ b/vp9/common/vp9_mv.h @@ -34,8 +34,8 @@ typedef struct mv32 { int32_t col; } MV32; -static void clamp_mv(MV *mv, int min_col, int max_col, - int min_row, int max_row) { +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); mv->row = clamp(mv->row, min_row, max_row); } diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 0936abfcd..f99952f3c 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -48,7 +48,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, int_mv *nearest, int_mv *near); // TODO(jingning): this mv clamping function should be block size dependent. -static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, xd->mb_to_top_edge - LEFT_TOP_MARGIN, diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h index 564e4195f..ac39a98fd 100644 --- a/vp9/common/vp9_onyx.h +++ b/vp9/common/vp9_onyx.h @@ -147,8 +147,12 @@ extern "C" { // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- - // Spatial scalability - int ss_number_layers; + // Spatial and temporal scalability. + int ss_number_layers; // Number of spatial layers. + int ts_number_layers; // Number of temporal layers. + // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. + int ts_target_bitrate[VPX_TS_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // these parameters aren't to be used in final build don't use!!! int play_alternate; diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index d92a25b12..97983c596 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_tile_common.h" @@ -94,6 +95,7 @@ typedef enum { typedef struct { int ref_count; + vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; @@ -223,13 +225,21 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + // Handles memory for the codec. + InternalFrameBufferList int_frame_buffers; } VP9_COMMON; -static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->frame_bufs[cm->new_fb_idx].buf; } -static int get_free_fb(VP9_COMMON *cm) { +static INLINE int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < FRAME_BUFFERS; i++) if (cm->frame_bufs[i].ref_count == 0) @@ -240,7 +250,7 @@ static int get_free_fb(VP9_COMMON *cm) { return i; } -static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { const int ref_index = *idx; if (ref_index >= 0 && bufs[ref_index].ref_count > 0) @@ -251,7 +261,7 @@ static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { bufs[new_idx].ref_count++; } -static int mi_cols_aligned_to_sb(int n_mis) { +static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } @@ -275,10 +285,10 @@ static INLINE void set_skip_context( } } -static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, - int mi_col, int bw, - int mi_rows, int mi_cols) { +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 11b6d93c1..487f00cca 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -218,27 +218,25 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { } else { // inter/inter const int above_has_second = has_second_ref(above_mbmi); const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; if (above_has_second && left_has_second) { - pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || - above_mbmi->ref_frame[1] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[1] == LAST_FRAME); + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = above_has_second ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == LAST_FRAME) pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); else pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + - 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); } } } else if (has_above || has_left) { // one edge available @@ -291,23 +289,23 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { } else { // inter/inter const int above_has_second = has_second_ref(above_mbmi); const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; if (above_has_second && left_has_second) { - if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && - above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) - pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || - above_mbmi->ref_frame[1] == GOLDEN_FRAME || - left_mbmi->ref_frame[0] == GOLDEN_FRAME || - left_mbmi->ref_frame[1] == GOLDEN_FRAME); + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); else pred_context = 2; } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = above_has_second ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == GOLDEN_FRAME) pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); @@ -316,17 +314,15 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); } else { - if (above_mbmi->ref_frame[0] == LAST_FRAME && - left_mbmi->ref_frame[0] == LAST_FRAME) { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { pred_context = 3; - } else if (above_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME) { - const MB_MODE_INFO *edge_mbmi = - above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi; - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); } } } diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 0acee32f8..33ae5a896 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -98,8 +98,8 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, int vp9_get_tx_size_context(const MACROBLOCKD *xd); -static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { switch (max_tx_size) { case TX_8X8: return tx_probs->p8x8[ctx]; @@ -113,13 +113,14 @@ static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); } -static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, - struct tx_counts *tx_counts) { +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { switch (max_tx_size) { case TX_8X8: return tx_counts->p8x8[ctx]; diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c index 884884e0b..f9bc06ecf 100644 --- a/vp9/common/vp9_prob.c +++ b/vp9/common/vp9_prob.c @@ -28,3 +28,34 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update); + return left_count + right_count; +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, + max_update_factor, probs); +} diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h index cc8d8ab38..f36148035 100644 --- a/vp9/common/vp9_prob.h +++ b/vp9/common/vp9_prob.h @@ -79,37 +79,10 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob, return weighted_prob(pre_prob, prob, factor); } -static unsigned int tree_merge_probs_impl(unsigned int i, - const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, - vp9_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, - count_sat, max_update_factor); - return left_count + right_count; -} +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs); -static void tree_merge_probs(const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, - count_sat, max_update_factor, probs); -} DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 6dbdb4216..9fef8b1ef 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -130,7 +130,8 @@ int16_t vp9_ac_quant(int qindex, int delta) { } -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) { +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); return seg->abs_delta == SEGMENT_ABSDATA ? diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h index af50e23cd..581104006 100644 --- a/vp9/common/vp9_quant_common.h +++ b/vp9/common/vp9_quant_common.h @@ -27,7 +27,8 @@ void vp9_init_quant_tables(); int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex); +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index cc70e4cc0..7576e7b6f 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -269,21 +269,15 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv : mi_mv_pred_q4(mi, ref)) : mi->mbmi.mv[ref].as_mv; - - // TODO(jkoleszar): This clamping is done in the incorrect place for the - // scaling case. It needs to be done on the scaled MV, not the pre-scaling - // MV. Note however that it performs the subsampling aware scaling so - // that the result is always q4. - // mv_precision precision is MV_PRECISION_Q4. - const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, - pd->subsampling_x, - pd->subsampling_y); - MV32 scaled_mv; - int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width, - frame_height, subpel_x, subpel_y, buf_stride; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; uint8_t *ref_frame, *buf_ptr; const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + const MV mv_q4 = { + mv.row * (1 << (1 - pd->subsampling_y)), + mv.col * (1 << (1 - pd->subsampling_x)) + }; // Get reference frame pointer, width and height. if (plane == 0) { @@ -327,10 +321,6 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, x0_16 += scaled_mv.col; y0_16 += scaled_mv.row; - // Get reference block bottom right coordinate. - x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; - y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; - // Get reference block pointer. buf_ptr = ref_frame + y0 * pre_buf->stride + x0; buf_stride = pre_buf->stride; @@ -339,6 +329,9 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // width/height is not a multiple of 8 pixels. if (scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || (frame_height & 0x7)) { + // Get reference block bottom right coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; int x_pad = 0, y_pad = 0; if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index bf738c28b..dccd60938 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -39,18 +39,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, enum mv_precision precision, int x, int y); -static int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; return y * stride + x; } -static void setup_pred_plane(struct buf_2d *dst, - uint8_t *src, int stride, - int mi_row, int mi_col, - const struct scale_factors *scale, - int subsampling_x, int subsampling_y) { +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 04a40bd58..7bdd11eb0 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -707,14 +707,14 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht4x4 sse2 avx2 +prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht4x4 sse2 avx2 -prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht8x8 sse2 avx2 +prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht8x8 sse2 avx2 -prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht16x16 sse2 avx2 +prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht16x16 sse2 avx2 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" specialize vp9_fwht4x4 diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index 90b0d0bf9..a9dda1889 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -40,12 +40,12 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors *sf) { +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct scale_factors *sf) { +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE; } diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h index ee9a4823b..7455abce3 100644 --- a/vp9/common/vp9_systemdependent.h +++ b/vp9/common/vp9_systemdependent.h @@ -11,13 +11,17 @@ #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ -#ifdef __cplusplus -extern "C" { +#ifdef _MSC_VER +# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) +# include <intrin.h> +# define USE_MSC_INTRIN +# endif +# include <math.h> +# define snprintf _snprintf #endif -#ifdef _MSC_VER -#include <math.h> -#define snprintf _snprintf +#ifdef __cplusplus +extern "C" { #endif #include "./vpx_config.h" @@ -30,7 +34,7 @@ void vpx_reset_mmx_state(void); #if defined(_MSC_VER) && _MSC_VER < 1800 // round is not defined in MSVC before VS2013. -static int round(double x) { +static INLINE int round(double x) { if (x < 0) return (int)ceil(x - 0.5); else @@ -44,9 +48,7 @@ static int round(double x) { static INLINE int get_msb(unsigned int n) { return 31 ^ __builtin_clz(n); } -#elif defined(_MSC_VER) && _MSC_VER > 1310 && \ - (defined(_M_X64) || defined(_M_IX86)) -#include <intrin.h> +#elif defined(USE_MSC_INTRIN) #pragma intrinsic(_BitScanReverse) static INLINE int get_msb(unsigned int n) { @@ -54,6 +56,7 @@ static INLINE int get_msb(unsigned int n) { _BitScanReverse(&first_set_bit, n); return first_set_bit; } +#undef USE_MSC_INTRIN #else // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 8ab5fb1bc..60018ea86 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -16,15 +16,15 @@ typedef void filter8_1dfunction ( const unsigned char *src_ptr, - const unsigned int src_pitch, + const ptrdiff_t src_pitch, unsigned char *output_ptr, - unsigned int out_pitch, + ptrdiff_t out_pitch, unsigned int output_height, const short *filter ); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \ -void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -32,50 +32,68 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ if (step_q4 == 16 && filter[3] != 128) { \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 4; \ dst += 4; \ w -= 4; \ @@ -136,18 +154,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -169,11 +187,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3, sse2); + ssse3); // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -236,11 +254,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2, - sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..b5e18fe6d --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret |