Diffstat (limited to 'vp9')
37 files changed, 3916 insertions, 577 deletions
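This change moves the VP9 inverse transforms from int16_t coefficients to the tran_low_t/tran_high_t pair so that dequantized values for 10- and 12-bit streams fit. The typedefs themselves live in vp9/common/vp9_idct.h (newly included by vp9_blockd.h below) and are not shown in this diff; a minimal sketch of what they are assumed to look like, with widths inferred from the WRAPLOW comment in vp9_idct.c:

/* Sketch only -- the real definitions live in vp9/common/vp9_idct.h, which
 * this diff includes but does not show. tran_low_t holds dequantized
 * coefficients (8 + bd significant bits, so up to 20 bits at bd = 12);
 * tran_high_t gives headroom for the multiply/accumulate butterfly stages. */
#if CONFIG_VP9_HIGHBITDEPTH
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#else
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;  /* the pre-patch coefficient type */
#endif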
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 7f8c5f215..b9094ed61 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -21,6 +21,7 @@ #include "vp9/common/vp9_common_data.h" #include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" @@ -179,7 +180,7 @@ struct buf_2d { }; struct macroblockd_plane { - int16_t *dqcoeff; + tran_low_t *dqcoeff; PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; @@ -226,11 +227,17 @@ typedef struct macroblockd { /* mc buffer */ DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); +#if CONFIG_VP9_HIGHBITDEPTH + /* Bit depth: 8, 10, 12 */ + int bd; + DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); +#endif + int lossless; int corrupted; - DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]); ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 856d41e70..b196fc527 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -18,14 +18,47 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { +#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH +// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs strict +// overflow wrapping to match expected hardware implementations. +// bd of 8 uses tran_low_t with 16bits, need to remove 16bits +// bd of 10 uses tran_low_t with 18bits, need to remove 14bits +// bd of 12 uses tran_low_t with 20bits, need to remove 12bits +// bd of x uses tran_low_t with 8+x bits, need to remove 24-x bits +#define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd)) +#else +#define WRAPLOW(x) (x) +#endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low, + tran_low_t high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest, + tran_high_t trans, int bd) { + trans = WRAPLOW(trans); + switch (bd) { + case 8: + default: + return clamp_high(WRAPLOW(dest + trans), 0, 255); + case 10: + return clamp_high(WRAPLOW(dest + trans), 0, 1023); + case 12: + return clamp_high(WRAPLOW(dest + trans), 0, 4095); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel.
*/ int i; - int16_t output[16]; - int a1, b1, c1, d1, e1; - const int16_t *ip = input; - int16_t *op = output; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -70,12 +103,12 @@ void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { int i; - int a1, e1; - int16_t tmp[4]; - const int16_t *ip = in; - int16_t *op = tmp; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; @@ -96,9 +129,9 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { } } -static void idct4(const int16_t *input, int16_t *output) { - int16_t step[4]; - int temp1, temp2; +static void idct4(const tran_low_t *input, tran_low_t *output) { + tran_low_t step[4]; + tran_high_t temp1, temp2; // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; @@ -116,11 +149,11 @@ static void idct4(const int16_t *input, int16_t *output) { output[3] = step[0] - step[3]; } -void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[4 * 4]; - int16_t *outptr = out; +void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[4], temp_out[4]; + tran_low_t temp_in[4], temp_out[4]; // Rows for (i = 0; i < 4; ++i) { @@ -140,10 +173,11 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, + int dest_stride) { int i; - int a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + tran_high_t a1; + tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -156,9 +190,9 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { } } -static void idct8(const int16_t *input, int16_t *output) { - int16_t step1[8], step2[8]; - int temp1, temp2; +static void idct8(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; // stage 1 step1[0] = input[0]; step1[2] = input[4]; @@ -201,11 +235,11 @@ static void idct8(const int16_t *input, int16_t *output) { output[7] = step1[0] - step1[7]; } -void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[8 * 8]; - int16_t *outptr = out; +void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[8], temp_out[8]; + tran_low_t temp_in[8], temp_out[8]; // First transform rows for (i = 0; i < 8; ++i) { @@ -225,10 +259,10 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; - int a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + tran_high_t a1; + tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); out = 
dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { @@ -238,13 +272,13 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void iadst4(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; +static void iadst4(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - int x0 = input[0]; - int x1 = input[1]; - int x2 = input[2]; - int x3 = input[3]; + tran_high_t x0 = input[0]; + tran_high_t x1 = input[1]; + tran_high_t x2 = input[2]; + tran_high_t x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { output[0] = output[1] = output[2] = output[3] = 0; @@ -280,7 +314,7 @@ static void iadst4(const int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(s3); } -void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { const transform_2d IHT_4[] = { { idct4, idct4 }, // DCT_DCT = 0 @@ -290,9 +324,9 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, }; int i, j; - int16_t out[4 * 4]; - int16_t *outptr = out; - int16_t temp_in[4], temp_out[4]; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; // inverse transform row vectors for (i = 0; i < 4; ++i) { @@ -311,17 +345,17 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, + dest[j * stride + i]); } } -static void iadst8(const int16_t *input, int16_t *output) { +static void iadst8(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; - int x0 = input[7]; - int x1 = input[0]; - int x2 = input[5]; - int x3 = input[2]; - int x4 = input[3]; - int x5 = input[4]; - int x6 = input[1]; - int x7 = input[6]; + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { output[0] = output[1] = output[2] = output[3] = output[4] @@ -395,12 +429,12 @@ static const transform_2d IHT_8[] = { { iadst8, iadst8 } // ADST_ADST = 3 }; -void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { int i, j; - int16_t out[8 * 8]; - int16_t *outptr = out; - int16_t temp_in[8], temp_out[8]; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; const transform_2d ht = IHT_8[tx_type]; // inverse transform row vectors @@ -421,11 +455,11 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, } } -void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[8 * 8] = { 0 }; - int16_t *outptr = out; +void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[8], temp_out[8]; + tran_low_t temp_in[8], temp_out[8]; // First transform rows // only first 4 row has non-zero coefs @@ -446,9 +480,9 @@ void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void idct16(const int16_t *input, int16_t *output) { - int16_t step1[16], step2[16]; - int temp1, temp2; +static void idct16(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[16], step2[16]; + tran_high_t 
temp1, temp2; // stage 1 step1[0] = input[0/2]; @@ -611,11 +645,12 @@ static void idct16(const int16_t *input, int16_t *output) { output[15] = step2[0] - step2[15]; } -void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[16 * 16]; - int16_t *outptr = out; +void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[16], temp_out[16]; + tran_low_t temp_in[16], temp_out[16]; // First transform rows for (i = 0; i < 16; ++i) { @@ -635,25 +670,26 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void iadst16(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - - int x0 = input[15]; - int x1 = input[0]; - int x2 = input[13]; - int x3 = input[2]; - int x4 = input[11]; - int x5 = input[4]; - int x6 = input[9]; - int x7 = input[6]; - int x8 = input[7]; - int x9 = input[8]; - int x10 = input[5]; - int x11 = input[10]; - int x12 = input[3]; - int x13 = input[12]; - int x14 = input[1]; - int x15 = input[14]; +static void iadst16(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { @@ -813,12 +849,12 @@ static const transform_2d IHT_16[] = { { iadst16, iadst16 } // ADST_ADST = 3 }; -void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { int i, j; - int16_t out[16 * 16]; - int16_t *outptr = out; - int16_t temp_in[16], temp_out[16]; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; const transform_2d ht = IHT_16[tx_type]; // Rows @@ -839,11 +875,12 @@ void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, } } -void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[16 * 16] = { 0 }; - int16_t *outptr = out; +void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[16], temp_out[16]; + tran_low_t temp_in[16], temp_out[16]; // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
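The WRAPLOW macro introduced at the top of vp9_idct.c is a no-op unless CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is enabled; when it is, shifting left and then arithmetically right by (24 - bd) sign-extends from bit (7 + bd), discarding the high bits exactly as a fixed-width hardware register would. A hypothetical standalone check of that behavior (like the macro itself, it leans on the common arithmetic-shift behavior of signed int32_t):

#include <assert.h>
#include <stdint.h>

/* Same operation as the patch's WRAPLOW(x), with bd passed explicitly. */
static int32_t wraplow(int32_t x, int bd) {
  return (x << (24 - bd)) >> (24 - bd);
}

int main(void) {
  /* bd = 8: tran_low_t carries 16 significant bits, so values wrap at 2^15. */
  assert(wraplow(32767, 8) == 32767);   /* fits: unchanged */
  assert(wraplow(32768, 8) == -32768);  /* overflows: wraps like an int16_t */
  /* bd = 10: 18 significant bits, so the same value passes through intact. */
  assert(wraplow(32768, 10) == 32768);
  return 0;
}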
@@ -864,10 +901,10 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; - int a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + tran_high_t a1; + tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { @@ -877,9 +914,9 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void idct32(const int16_t *input, int16_t *output) { - int16_t step1[32], step2[32]; - int temp1, temp2; +static void idct32(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; // stage 1 step1[0] = input[0]; @@ -1244,11 +1281,12 @@ static void idct32(const int16_t *input, int16_t *output) { output[31] = step1[0] - step1[31]; } -void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[32 * 32]; - int16_t *outptr = out; +void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[32], temp_out[32]; + tran_low_t temp_in[32], temp_out[32]; // Rows for (i = 0; i < 32; ++i) { @@ -1265,7 +1303,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { if (zero_coeff[0] | zero_coeff[1]) idct32(input, outptr); else - vpx_memset(outptr, 0, sizeof(int16_t) * 32); + vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); input += 32; outptr += 32; } @@ -1281,11 +1319,12 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { - int16_t out[32 * 32] = {0}; - int16_t *outptr = out; +void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; int i, j; - int16_t temp_in[32], temp_out[32]; + tran_low_t temp_in[32], temp_out[32]; // Rows // only upper-left 8x8 has non-zero coeff @@ -1306,11 +1345,11 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; - int a1; + tran_high_t a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1322,7 +1361,8 @@ void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } // idct -void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { if (eob > 1) vp9_idct4x4_16_add(input, dest, stride); else @@ -1330,14 +1370,16 @@ void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { } -void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { if (eob > 1) vp9_iwht4x4_16_add(input, dest, stride); else vp9_iwht4x4_1_add(input, dest, stride); } -void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, 
int eob) { +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -1354,7 +1396,7 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { vp9_idct8x8_64_add(input, dest, stride); } -void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ @@ -1367,7 +1409,7 @@ void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, vp9_idct16x16_256_add(input, dest, stride); } -void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { if (eob == 1) vp9_idct32x32_1_add(input, dest, stride); @@ -1379,7 +1421,7 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, } // iht -void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob) { if (tx_type == DCT_DCT) vp9_idct4x4_add(input, dest, stride, eob); @@ -1387,7 +1429,7 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, vp9_iht4x4_16_add(input, dest, stride, tx_type); } -void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob) { if (tx_type == DCT_DCT) { vp9_idct8x8_add(input, dest, stride, eob); @@ -1396,7 +1438,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, } } -void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob) { if (tx_type == DCT_DCT) { vp9_idct16x16_add(input, dest, stride, eob); @@ -1404,3 +1446,1433 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, vp9_iht16x16_256_add(input, dest, stride, tx_type); } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
*/ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd); + dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd); + dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd); + dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd); + + ip++; + dest++; + } +} + +static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void) bd; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void) bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 4; ++i) { + high_idct4(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + high_idct4(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } +} + +void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = 
WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_bd_high(dest[0], a1, bd); + dest[1] = clip_pixel_bd_high(dest[1], a1, bd); + dest[2] = clip_pixel_bd_high(dest[2], a1, bd); + dest[3] = clip_pixel_bd_high(dest[3], a1, bd); + dest += dest_stride; + } +} + +static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 & stage 3 - even half + high_idct4(step1, step1, bd); + + // stage 2 - odd half + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + for (i = 0; i < 8; ++i) { + high_idct8(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns. 
+ for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + high_idct8(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) + dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5), + bd); + } +} + +void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = clip_pixel_bd_high(dest[i], a1, bd); + dest += stride; + } +} + +static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[0]; + tran_high_t x1 = input[1]; + tran_high_t x2 = input[2]; + tran_high_t x3 = input[3]; + (void) bd; + + if (!(x0 | x1 | x2 | x3)) { + vpx_memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0)); + output[1] = WRAPLOW(dct_const_round_shift(s1)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s3)); +} + +void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + const high_transform_2d IHT_4[] = { + { high_idct4, high_idct4 }, // DCT_DCT = 0 + { high_iadst4, high_idct4 }, // ADST_DCT = 1 + { high_idct4, high_iadst4 }, // DCT_ADST = 2 + { high_iadst4, high_iadst4 } // ADST_ADST = 3 + }; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Inverse transform row vectors. + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Inverse transform column vectors. 
+ for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } +} + +static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + (void) bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + vpx_memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); +} + +static const high_transform_2d HIGH_IHT_8[] = { + { high_idct8, high_idct8 }, // DCT_DCT = 0 + { high_iadst8, high_idct8 }, // ADST_DCT = 1 + { high_idct8, high_iadst8 }, // DCT_ADST = 2 + { high_iadst8, high_iadst8 } // ADST_ADST = 3 +}; + +void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + const high_transform_2d ht = HIGH_IHT_8[tx_type]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Inverse transform row vectors. + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Inverse transform column vectors. 
+ for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } +} + +void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + // Only first 4 row has non-zero coefs. + for (i = 0; i < 4; ++i) { + high_idct8(input, outptr, bd); + input += 8; + outptr += 8; + } + // Then transform columns. + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + high_idct8(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } +} + +static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + (void) bd; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + 
step1[15] = WRAPLOW(step2[14] + step2[15]); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WRAPLOW(step2[0] + step2[15]); + output[1] = WRAPLOW(step2[1] + step2[14]); + output[2] = WRAPLOW(step2[2] + step2[13]); + output[3] = WRAPLOW(step2[3] + step2[12]); + output[4] = WRAPLOW(step2[4] + step2[11]); + output[5] = WRAPLOW(step2[5] + step2[10]); + output[6] = WRAPLOW(step2[6] + step2[9]); + output[7] = WRAPLOW(step2[7] + step2[8]); + output[8] = WRAPLOW(step2[7] - step2[8]); + output[9] = WRAPLOW(step2[6] - step2[9]); + output[10] = WRAPLOW(step2[5] - step2[10]); + output[11] = WRAPLOW(step2[4] - step2[11]); + output[12] = WRAPLOW(step2[3] - step2[12]); + output[13] = WRAPLOW(step2[2] - step2[13]); + output[14] = WRAPLOW(step2[1] - step2[14]); 
+ output[15] = WRAPLOW(step2[0] - step2[15]); +} + +void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + for (i = 0; i < 16; ++i) { + high_idct16(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns. + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + high_idct16(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } +} + +static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + (void) bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + vpx_memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + 
+ x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + +static const high_transform_2d HIGH_IHT_16[] = { + { high_idct16, high_idct16 }, // DCT_DCT = 0 + { high_iadst16, high_idct16 }, // ADST_DCT = 1 + { high_idct16, high_iadst16 }, // DCT_ADST = 2 + { high_iadst16, high_iadst16 } // ADST_ADST = 3 +}; + +void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + const high_transform_2d ht = HIGH_IHT_16[tx_type]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + 
ht.cols(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } +} + +void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + high_idct16(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns. + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + high_idct16(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } +} + +void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel_bd_high(dest[i], a1, bd); + dest += stride; + } +} + +static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + (void) bd; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 
+ step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = 
WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = WRAPLOW(step1[14]); + step2[15] = WRAPLOW(step1[15]); + + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[21] + 
step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); +} + +void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 32; ++i) { + tran_low_t zero_coeff[16]; + for (j = 0; j < 16; ++j) + zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + high_idct32(input, outptr, bd); + else + vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + high_idct32(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) + dest[j * stride + i] = clip_pixel_bd_high( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } +} + +void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + uint16_t *dest = 
CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  // Only upper-left 8x8 has non-zero coeff.
+  for (i = 0; i < 8; ++i) {
+    high_idct32(input, outptr, bd);
+    input += 32;
+    outptr += 32;
+  }
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    high_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  int a1;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+// idct
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  if (eob > 1)
+    vp9_high_idct4x4_16_add(input, dest, stride, bd);
+  else
+    vp9_high_idct4x4_1_add(input, dest, stride, bd);
+}
+
+
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  if (eob > 1)
+    vp9_high_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    vp9_high_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value and does not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely
+  // eobs >= 1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): The "eobs = 1" case is also handled in
+  // vp9_short_idct8x8_c. Combine that with the code here.
+  // DC only DCT coefficient
+  if (eob == 1) {
+    vp9_high_idct8x8_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vp9_high_idct8x8_10_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct8x8_64_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to separate different cases.
+  // DC only DCT coefficient.
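  // (Editorial sketch, not part of the change: a worked view of the eob
  // thresholds used by these dispatchers. Assuming the usual VP9 scan
  // orders, eob <= 10 means every non-zero coefficient lies in the first
  // few scan positions, i.e. the top-left 4x4 corner of the coefficient
  // block, which the *_10_add variant exploits by transforming only that
  // corner before running full columns:
  //
  //   eob == 1  -> vp9_high_idct16x16_1_add    (DC only, no butterflies)
  //   eob <= 10 -> vp9_high_idct16x16_10_add   (top-left 4x4 of coeffs)
  //   otherwise -> vp9_high_idct16x16_256_add  (all 256 coefficients))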
+  if (eob == 1) {
+    vp9_high_idct16x16_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vp9_high_idct16x16_10_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct16x16_256_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // Non-zero coeff only in upper-left 8x8 (the eob <= 34 case).
+  if (eob == 1) {
+    vp9_high_idct32x32_1_add(input, dest, stride, bd);
+  } else if (eob <= 34) {
+    vp9_high_idct32x32_34_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct32x32_1024_add(input, dest, stride, bd);
+  }
+}
+
+// iht
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT)
+    vp9_high_idct4x4_add(input, dest, stride, eob, bd);
+  else
+    vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
+}
+
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_high_idct8x8_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
+  }
+}
+
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_high_idct16x16_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 7f595e1cc..694be3cf9 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -36,52 +36,69 @@ extern "C" {
 #define dual_set_epi16(a, b) \
   _mm_set_epi16(b, b, b, b, a, a, a, a)
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
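// (Editorial aside on the widths chosen below, derived from the coefficient
// ranges documented at dct_const_round_shift rather than stated in the
// change: a 12-bit stream keeps tran_low_t values within a signed 20-bit
// range, and a butterfly stage multiplies such a value by a ~14-bit cospi
// constant, giving ~34 significant bits before the round-shift -- too wide
// for int32_t, hence the int64_t intermediate. Without high bit depth the
// same product is at most ~16 + 14 = 30 bits, which int32_t holds.)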
+#if CONFIG_VP9_HIGHBITDEPTH +typedef int64_t tran_high_t; +typedef int32_t tran_low_t; +#else +typedef int32_t tran_high_t; +typedef int16_t tran_low_t; +#endif + // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const int cospi_1_64 = 16364; -static const int cospi_2_64 = 16305; -static const int cospi_3_64 = 16207; -static const int cospi_4_64 = 16069; -static const int cospi_5_64 = 15893; -static const int cospi_6_64 = 15679; -static const int cospi_7_64 = 15426; -static const int cospi_8_64 = 15137; -static const int cospi_9_64 = 14811; -static const int cospi_10_64 = 14449; -static const int cospi_11_64 = 14053; -static const int cospi_12_64 = 13623; -static const int cospi_13_64 = 13160; -static const int cospi_14_64 = 12665; -static const int cospi_15_64 = 12140; -static const int cospi_16_64 = 11585; -static const int cospi_17_64 = 11003; -static const int cospi_18_64 = 10394; -static const int cospi_19_64 = 9760; -static const int cospi_20_64 = 9102; -static const int cospi_21_64 = 8423; -static const int cospi_22_64 = 7723; -static const int cospi_23_64 = 7005; -static const int cospi_24_64 = 6270; -static const int cospi_25_64 = 5520; -static const int cospi_26_64 = 4756; -static const int cospi_27_64 = 3981; -static const int cospi_28_64 = 3196; -static const int cospi_29_64 = 2404; -static const int cospi_30_64 = 1606; -static const int cospi_31_64 = 804; +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const int sinpi_1_9 = 5283; -static const int sinpi_2_9 = 9929; -static const int sinpi_3_9 = 13377; -static const int sinpi_4_9 = 15212; - -static INLINE int dct_const_round_shift(int input) { - int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); -#if CONFIG_COEFFICIENT_RANGE_CHECKING +static const tran_high_t sinpi_1_9 = 5283; +static const tran_high_t sinpi_2_9 = 9929; +static const tran_high_t sinpi_3_9 = 13377; +static const tran_high_t sinpi_4_9 = 15212; + +static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { + 
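  // (Editorial example, assuming DCT_CONST_BITS == 14 as defined for this
  // header: dct_const_round_shift(100 * cospi_16_64) computes
  // (1158500 + 8192) >> 14 = 71, a fixed-point rounding of
  // 100 * cos(pi/4) ~= 70.7 -- this is how each butterfly stage in
  // vp9_idct.c rescales its products back into coefficient range.)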
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); +#if CONFIG_VP9_HIGHBITDEPTH + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer +#elif CONFIG_COEFFICIENT_RANGE_CHECKING // For valid VP9 input streams, intermediate stage coefficients should always // stay within the range of a signed 16 bit integer. Coefficients can go out // of this range for invalid/corrupt VP9 streams. However, strictly checking @@ -91,32 +108,59 @@ static INLINE int dct_const_round_shift(int input) { assert(INT16_MIN <= rv); assert(rv <= INT16_MAX); #endif - return (int16_t)rv; + return (tran_low_t)rv; } -typedef void (*transform_1d)(const int16_t*, int16_t*); +typedef void (*transform_1d)(const tran_low_t*, tran_low_t*); typedef struct { transform_1d cols, rows; // vertical and horizontal } transform_2d; -void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob); +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd); -void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int +typedef struct { + high_transform_1d cols, rows; // vertical and horizontal +} high_transform_2d; +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); - +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd); +void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd); +void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd); +#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_loopfilter.c 
b/vp9/common/vp9_loopfilter.c index 3b39d4274..4d03c4dcb 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1193,7 +1193,7 @@ void vp9_filter_block_plane(VP9_COMMON *const cm, } } -void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, +void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only) { @@ -1247,9 +1247,8 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, y_only); } -int vp9_loop_filter_worker(void *arg1, void *arg2) { - LFWorkerData *const lf_data = (LFWorkerData*)arg1; - (void)arg2; +int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { + (void)unused; vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); return 1; diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 6fa2773e5..69e7dd08c 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -111,13 +111,13 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, int y_only, int partial_frame); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. -void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, +void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only); typedef struct LoopFilterWorkerData { - const YV12_BUFFER_CONFIG *frame_buffer; + YV12_BUFFER_CONFIG *frame_buffer; struct VP9Common *cm; struct macroblockd_plane planes[MAX_MB_PLANE]; @@ -129,8 +129,8 @@ typedef struct LoopFilterWorkerData { int num_lf_workers; } LFWorkerData; -// Operates on the rows described by LFWorkerData passed as 'arg1'. -int vp9_loop_filter_worker(void *arg1, void *arg2); +// Operates on the rows described by 'lf_data'. 
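// (Editorial aside, not part of the change: the typed first parameter
// replaces the old void* pair. A sketch of a call site, assuming the
// VP9Worker/VP9WorkerHook interface of vp9/common/vp9_thread.h from this
// era, whose hook still takes two void pointers, so the cast happens once
// at assignment:
//   worker->hook = (VP9WorkerHook)vp9_loop_filter_worker;
//   worker->data1 = lf_data;  // received as 'lf_data'
//   worker->data2 = NULL;     // received as 'unused'
// )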
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 667e057b6..32bcf9a77 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -6,6 +6,7 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_idct.h" struct macroblockd; @@ -45,6 +46,13 @@ if ($opts{arch} eq "x86_64") { $avx_x86_64 = $avx2_x86_64 = ''; } +# optimizations which depend on multiple features +if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) { + $avx2_ssse3 = 'avx2'; +} else { + $avx2_ssse3 = ''; +} + # # RECON # @@ -296,15 +304,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc"; $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon; add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2 avx2/; +specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3"; $vp9_convolve8_neon_asm=vp9_convolve8_neon; add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2 avx2/; +specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3"; $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon; add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2 avx2/; +specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3"; $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon; add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; @@ -322,68 +330,177 @@ $vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon; # # dct # -add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; -$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add/; + + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add/; + + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add/; + + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add/; + + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add/; + + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add/; + + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, 
int dest_stride"; + specialize qw/vp9_idct16x16_256_add/; + + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add/; + + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add/; + + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add/; + + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add/; + + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht4x4_16_add/; + + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht8x8_64_add/; + + add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp9_iht16x16_256_add/; + + # dct and add + + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add/; -add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; -$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add/; +} else { + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; + $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; + + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; + $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; + + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; + $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; + + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; + $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; + + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; + $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; + + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; + $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; + + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; + $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; + + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; + $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; + + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; + 
$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; -add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; -$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; + $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; -add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; -$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; + $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; -add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; -$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; + $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; -add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; -$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; + $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; -add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; -$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; + add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp9_iht16x16_256_add sse2 dspr2/; + + # dct and add + + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add/; + + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add/; +} + + +# High bitdepth functions +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { +# +# dct +# +add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct4x4_1_add/; -add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; -$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; +add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct4x4_16_add/; -add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; -$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; +add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct8x8_1_add/; -add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, 
uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; -$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; +add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct8x8_64_add/; -add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; -$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; +add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct8x8_10_add/; -add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; -specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; -$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; +add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct16x16_1_add/; -add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; -specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; -$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; +add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct16x16_256_add/; -add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type"; -specialize qw/vp9_iht16x16_256_add sse2 dspr2/; +add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct16x16_10_add/; + +add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct32x32_1024_add/; + +add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct32x32_34_add/; + +add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_idct32x32_1_add/; + +add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; +specialize qw/vp9_high_iht4x4_16_add/; + +add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; +specialize qw/vp9_high_iht8x8_64_add/; + +add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; +specialize qw/vp9_high_iht16x16_256_add/; # dct and add -add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_iwht4x4_1_add/; +add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_iwht4x4_1_add/; -add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_iwht4x4_16_add/; +add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; +specialize qw/vp9_high_iwht4x4_16_add/; +} # # Encoder functions below this point. 
@@ -699,23 +816,42 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; # ENCODEMB INVOKE -add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"; -specialize qw/vp9_block_error avx2/, "$sse2_x86inc"; - add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; -add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64"; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { +# the transform coefficients are held in 32-bit +# values, so the assembler code for vp9_block_error can no longer be used. + add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; + specialize qw/vp9_block_error/; + + add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_fp/; -add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; + add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_fp_32x32/; -add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; + add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_b/; -add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, 
const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; + add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_b_32x32/; +} else { + add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; + specialize qw/vp9_block_error avx2/; + + add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64"; + + add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; + + add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; + + add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_b_32x32/; +} # # Structured Similarity (SSIM) @@ -729,44 +865,86 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { } # fdct functions -add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht4x4 sse2/; -add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht8x8 sse2/; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_fht4x4/; + + add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_fht8x8/; -add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht16x16 sse2/; + add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_fht16x16/; -add_proto 
qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; + add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fwht4x4/; -add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct4x4_1 sse2/; + add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct4x4_1/; -add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct4x4 sse2/; + add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct4x4/; -add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8_1 sse2 neon/; + add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct8x8_1/; -add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; + add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct8x8/; -add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct16x16_1 sse2/; + add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct16x16_1/; -add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct16x16 sse2/; + add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct16x16/; -add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct32x32_1 sse2/; + add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_1/; -add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct32x32 sse2 avx2/; + add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32/; + + add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_rd/; +} else { + add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_fht4x4 sse2/; -add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct32x32_rd sse2 avx2/; + add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_fht8x8 sse2/; + + add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_fht16x16 sse2/; + + add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; + + add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct4x4_1 sse2/; + + add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct4x4 sse2/; + + add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct8x8_1 sse2 neon/; + + add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t 
*output, int stride"; + specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; + + add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct16x16_1 sse2/; + + add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct16x16 sse2/; + + add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_1 sse2/; + + add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32 sse2 avx2/; + + add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_rd sse2 avx2/; +} # # Motion search @@ -788,6 +966,654 @@ specialize qw/vp9_full_range_search/; add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse2/; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + + # variance + add_proto qw/unsigned int vp9_high_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance32x16/; + + add_proto qw/unsigned int vp9_high_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance16x32/; + + add_proto qw/unsigned int vp9_high_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance64x32/; + + add_proto qw/unsigned int vp9_high_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance32x64/; + + add_proto qw/unsigned int vp9_high_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance32x32/; + + add_proto qw/unsigned int vp9_high_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance64x64/; + + add_proto qw/unsigned int vp9_high_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance16x16/; + + add_proto qw/unsigned int vp9_high_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance16x8/; + + add_proto qw/unsigned int vp9_high_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance8x16/; + + add_proto qw/unsigned int vp9_high_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance8x8/; + + add_proto qw/unsigned int vp9_high_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance8x4/; + + add_proto qw/unsigned int vp9_high_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize 
qw/vp9_high_variance4x8/; + + add_proto qw/unsigned int vp9_high_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_variance4x4/; + + add_proto qw/void vp9_high_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vp9_high_get8x8var/; + + add_proto qw/void vp9_high_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vp9_high_get16x16var/; + + add_proto qw/unsigned int vp9_high_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance32x16/; + + add_proto qw/unsigned int vp9_high_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance16x32/; + + add_proto qw/unsigned int vp9_high_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance64x32/; + + add_proto qw/unsigned int vp9_high_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance32x64/; + + add_proto qw/unsigned int vp9_high_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance32x32/; + + add_proto qw/unsigned int vp9_high_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance64x64/; + + add_proto qw/unsigned int vp9_high_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance16x16/; + + add_proto qw/unsigned int vp9_high_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance16x8/; + + add_proto qw/unsigned int vp9_high_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance8x16/; + + add_proto qw/unsigned int vp9_high_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance8x8/; + + add_proto qw/unsigned int vp9_high_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance8x4/; + + add_proto qw/unsigned int vp9_high_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance4x8/; + + add_proto qw/unsigned int vp9_high_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_variance4x4/; + + add_proto qw/void vp9_high_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vp9_high_10_get8x8var/; + + add_proto qw/void vp9_high_10_get16x16var/, "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vp9_high_10_get16x16var/; + + add_proto qw/unsigned int vp9_high_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance32x16/; + + add_proto qw/unsigned int vp9_high_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance16x32/; + + add_proto qw/unsigned int vp9_high_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance64x32/; + + add_proto qw/unsigned int vp9_high_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance32x64/; + + add_proto qw/unsigned int vp9_high_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance32x32/; + + add_proto qw/unsigned int vp9_high_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance64x64/; + + add_proto qw/unsigned int vp9_high_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance16x16/; + + add_proto qw/unsigned int vp9_high_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance16x8/; + + add_proto qw/unsigned int vp9_high_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance8x16/; + + add_proto qw/unsigned int vp9_high_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance8x8/; + + add_proto qw/unsigned int vp9_high_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance8x4/; + + add_proto qw/unsigned int vp9_high_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance4x8/; + + add_proto qw/unsigned int vp9_high_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_variance4x4/; + + add_proto qw/void vp9_high_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vp9_high_12_get8x8var/; + + add_proto qw/void vp9_high_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vp9_high_12_get16x16var/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance64x64/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance64x64/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance32x64/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance32x64/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance64x32/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance64x32/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance32x16/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance32x16/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance16x32/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance16x32/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance32x32/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance32x32/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance16x16/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance16x16/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize 
qw/vp9_high_sub_pixel_variance8x16/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance8x16/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance16x8/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance16x8/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance8x8/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance8x8/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance8x4/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance8x4/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance4x8/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance4x8/; + + add_proto qw/unsigned int vp9_high_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_sub_pixel_variance4x4/; + + add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_sub_pixel_avg_variance4x4/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance64x64/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance64x64/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, 
int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance32x64/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance32x64/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance64x32/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance64x32/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance32x16/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance32x16/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance16x32/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance16x32/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance32x32/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance32x32/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance16x16/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance16x16/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance8x16/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance8x16/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance16x8/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance16x8/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance8x8/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance8x8/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance8x4/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance8x4/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance4x8/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance4x8/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_10_sub_pixel_variance4x4/; + + add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_10_sub_pixel_avg_variance4x4/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance64x64/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance64x64/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize 
qw/vp9_high_12_sub_pixel_variance32x64/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance32x64/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance64x32/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance64x32/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance32x16/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance32x16/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance16x32/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance16x32/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance32x32/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance32x32/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance16x16/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance16x16/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance8x16/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize 
qw/vp9_high_12_sub_pixel_avg_variance8x16/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance16x8/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance16x8/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance8x8/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance8x8/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance8x4/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance8x4/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance4x8/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance4x8/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vp9_high_12_sub_pixel_variance4x4/; + + add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; + specialize qw/vp9_high_12_sub_pixel_avg_variance4x4/; + + add_proto qw/unsigned int vp9_high_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad64x64/; + + add_proto qw/unsigned int vp9_high_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad32x64/; + + add_proto qw/unsigned int vp9_high_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad64x32/; + + add_proto qw/unsigned int vp9_high_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad32x16/; + + add_proto qw/unsigned int vp9_high_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize 
qw/vp9_high_sad16x32/; + + add_proto qw/unsigned int vp9_high_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad32x32/; + + add_proto qw/unsigned int vp9_high_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad16x16/; + + add_proto qw/unsigned int vp9_high_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad16x8/; + + add_proto qw/unsigned int vp9_high_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad8x16/; + + add_proto qw/unsigned int vp9_high_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad8x8/; + + add_proto qw/unsigned int vp9_high_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad8x4/; + + add_proto qw/unsigned int vp9_high_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad4x8/; + + add_proto qw/unsigned int vp9_high_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vp9_high_sad4x4/; + + add_proto qw/unsigned int vp9_high_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad64x64_avg/; + + add_proto qw/unsigned int vp9_high_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad32x64_avg/; + + add_proto qw/unsigned int vp9_high_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad64x32_avg/; + + add_proto qw/unsigned int vp9_high_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad32x16_avg/; + + add_proto qw/unsigned int vp9_high_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad16x32_avg/; + + add_proto qw/unsigned int vp9_high_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad32x32_avg/; + + add_proto qw/unsigned int vp9_high_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad16x16_avg/; + + add_proto qw/unsigned int vp9_high_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad16x8_avg/; + + add_proto qw/unsigned int vp9_high_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad8x16_avg/; + + add_proto qw/unsigned int vp9_high_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad8x8_avg/; + + add_proto qw/unsigned int vp9_high_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, 
int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad8x4_avg/; + + add_proto qw/unsigned int vp9_high_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad4x8_avg/; + + add_proto qw/unsigned int vp9_high_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vp9_high_sad4x4_avg/; + + add_proto qw/void vp9_high_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad64x64x3/; + + add_proto qw/void vp9_high_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad32x32x3/; + + add_proto qw/void vp9_high_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad16x16x3/; + + add_proto qw/void vp9_high_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad16x8x3/; + + add_proto qw/void vp9_high_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad8x16x3/; + + add_proto qw/void vp9_high_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad8x8x3/; + + add_proto qw/void vp9_high_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad4x4x3/; + + add_proto qw/void vp9_high_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad64x64x8/; + + add_proto qw/void vp9_high_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad32x32x8/; + + add_proto qw/void vp9_high_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad16x16x8/; + + add_proto qw/void vp9_high_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad16x8x8/; + + add_proto qw/void vp9_high_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad8x16x8/; + + add_proto qw/void vp9_high_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad8x8x8/; + + add_proto qw/void vp9_high_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad8x4x8/; + + add_proto qw/void vp9_high_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad4x8x8/; + + add_proto qw/void vp9_high_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/vp9_high_sad4x4x8/; + + add_proto qw/void vp9_high_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad64x64x4d/; + + add_proto qw/void vp9_high_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad32x64x4d/; + + add_proto qw/void vp9_high_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad64x32x4d/; + + add_proto qw/void vp9_high_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad32x16x4d/; + + add_proto qw/void vp9_high_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad16x32x4d/; + + add_proto qw/void vp9_high_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad32x32x4d/; + + add_proto qw/void vp9_high_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad16x16x4d/; + + add_proto qw/void vp9_high_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad16x8x4d/; + + add_proto qw/void vp9_high_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad8x16x4d/; + + add_proto qw/void vp9_high_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad8x8x4d/; + + # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form + add_proto qw/void vp9_high_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad8x4x4d/; + + add_proto qw/void vp9_high_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad4x8x4d/; + + add_proto qw/void vp9_high_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; + specialize qw/vp9_high_sad4x4x4d/; + + add_proto qw/unsigned int vp9_high_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_mse16x16/; + + add_proto qw/unsigned int vp9_high_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_mse8x16/; + + add_proto qw/unsigned int vp9_high_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_mse16x8/; + + add_proto qw/unsigned int vp9_high_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_mse8x8/; + + add_proto qw/unsigned int vp9_high_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_10_mse16x16/; + + add_proto qw/unsigned int 
vp9_high_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_10_mse8x16/; + + add_proto qw/unsigned int vp9_high_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_10_mse16x8/; + + add_proto qw/unsigned int vp9_high_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_10_mse8x8/; + + add_proto qw/unsigned int vp9_high_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_12_mse16x16/; + + add_proto qw/unsigned int vp9_high_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_12_mse8x16/; + + add_proto qw/unsigned int vp9_high_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_12_mse16x8/; + + add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vp9_high_12_mse8x8/; + + # ENCODEMB INVOKE + + add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; + specialize qw/vp9_high_block_error/; + + add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/vp9_high_subtract_block/; + + add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_high_quantize_fp/; + + add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_high_quantize_fp_32x32/; + + add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_high_quantize_b/; + + add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_high_quantize_b_32x32/; + + # + # Structured Similarity (SSIM) + # + if 
(vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/vp9_high_ssim_parms_8x8/; + + add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift"; + specialize qw/vp9_high_ssim_parms_8x8_shift/; + } + + # fdct functions + add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_high_fht4x4/; + + add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_high_fht8x8/; + + add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_high_fht16x16/; + + add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fwht4x4/; + + add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct4x4/; + + add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct8x8_1/; + + add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct8x8/; + + add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct16x16_1/; + + add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct16x16/; + + add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct32x32_1/; + + add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct32x32/; + + add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_high_fdct32x32_rd/; + + add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; + specialize qw/vp9_high_temporal_filter_apply/; + +} +# End vp9_high encoder functions + } # end encoder functions 1; diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 1b4904c39..b6847b92e 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -139,25 +139,25 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ } \ } -#if HAVE_AVX2 +#if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vp9_filter_block1d16_v8_avx2; filter8_1dfunction vp9_filter_block1d16_h8_avx2; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -#if (ARCH_X86_64) +#if ARCH_X86_64 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 -#else 
+#else // ARCH_X86 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 -#endif +#endif // ARCH_X86_64 / ARCH_X86 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; filter8_1dfunction vp9_filter_block1d16_h2_ssse3; filter8_1dfunction vp9_filter_block1d8_v2_ssse3; @@ -190,9 +190,9 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_2D(, avx2); -#endif +#endif // HAVE_AVX2 && HAVE_SSSE3 #if HAVE_SSSE3 -#if (ARCH_X86_64) filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; @@ -204,14 +204,14 @@ filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 -#else +#else // ARCH_X86 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#endif +#endif // ARCH_X86_64 / ARCH_X86 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; @@ -270,7 +270,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_ , ssse3); -#endif +#endif // HAVE_SSSE3 #if HAVE_SSE2 filter8_1dfunction vp9_filter_block1d16_v8_sse2; @@ -336,4 +336,4 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_ , sse2); -#endif +#endif // HAVE_SSE2 diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index b60f8a06d..df609872b 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -3573,6 +3573,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); // idct constants for each stage const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); @@ -3635,7 +3636,6 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i, j, i32; - int zero_flag[2]; for (i = 0; i < 4; i++) { i32 = (i << 5); @@ -3710,13 +3710,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); - zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); - zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); - zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); - zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
- - if (!zero_flag[0] && !zero_flag[1]) { + if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { col[i32 + 0] = _mm_setzero_si128(); col[i32 + 1] = _mm_setzero_si128(); col[i32 + 2] = _mm_setzero_si128(); @@ -3795,7 +3789,6 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); } for (i = 0; i < 4; i++) { - const __m128i zero = _mm_setzero_si128(); // Second 1-D idct j = i << 3; diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index a9c03f0cc..7615cddda 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -195,7 +195,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, struct macroblockd_plane *const pd = &xd->plane[plane]; if (eob > 0) { TX_TYPE tx_type = DCT_DCT; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); if (xd->lossless) { tx_type = DCT_DCT; vp9_iwht4x4_add(dqcoeff, dst, stride, eob); @@ -668,6 +668,15 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } + cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; +} + +static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth, + int ref_xss, int ref_yss, + vpx_bit_depth_t this_bit_depth, + int this_xss, int this_yss) { + return ref_bit_depth == this_bit_depth && ref_xss == this_xss && + ref_yss == this_yss; } static void setup_frame_size_with_refs(VP9_COMMON *cm, @@ -707,6 +716,18 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, if (!has_valid_ref_frame) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + if (!valid_ref_frame_img_fmt( + ref_frame->buf->bit_depth, + ref_frame->buf->uv_crop_width < ref_frame->buf->y_crop_width, + ref_frame->buf->uv_crop_height < ref_frame->buf->y_crop_height, + cm->bit_depth, + cm->subsampling_x, + cm->subsampling_y)) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has incompatible color space"); + } resize_context_buffers(cm, width, height); setup_display_size(cm, rb); @@ -723,6 +744,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } + cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; } static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { @@ -938,9 +960,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, return vp9_reader_find_end(&tile_data->bit_reader); } -static int tile_worker_hook(void *arg1, void *arg2) { - TileWorkerData *const tile_data = (TileWorkerData*)arg1; - const TileInfo *const tile = (TileInfo*)arg2; +static int tile_worker_hook(TileWorkerData *const tile_data, + const TileInfo *const tile) { int mi_row, mi_col; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; @@ -1201,6 +1222,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } setup_frame_size(cm, rb); + pbi->need_resync = 0; } else { cm->intra_only = cm->show_frame ? 
0 : vp9_rb_read_bit(rb); @@ -1224,6 +1246,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); setup_frame_size(cm, rb); + pbi->need_resync = 0; } else { pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); for (i = 0; i < REFS_PER_FRAME; ++i) { @@ -1252,6 +1275,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } } + if (pbi->need_resync) { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + if (!cm->error_resilient_mode) { cm->refresh_frame_context = vp9_rb_read_bit(rb); cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 9106b0d14..6ee3d7037 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -60,6 +60,7 @@ VP9Decoder *vp9_decoder_create() { } cm->error.setjmp = 1; + pbi->need_resync = 1; initialize_dec(); // Initialize the references to not point to any frame buffers. @@ -238,6 +239,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, cm->new_fb_idx = get_free_fb(cm); if (setjmp(cm->error.jmp)) { + pbi->need_resync = 1; cm->error.setjmp = 0; vp9_clear_system_state(); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 848d212e6..4f52bb9c4 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -58,6 +58,7 @@ typedef struct VP9Decoder { int max_threads; int inv_tile_order; + int need_resync; // wait for key/intra-only frame } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 91cdf3860..76ca1ae8f 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -51,7 +51,7 @@ } while (0) static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, - int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, + tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, int ctx, const int16_t *scan, const int16_t *nb, vp9_reader *r) { const int max_eob = 16 << (tx_size << 1); diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index b82ea6a79..6635880e5 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -121,10 +121,10 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(void *arg1, void *arg2) { - TileWorkerData *const tile_data = (TileWorkerData*)arg1; +static int loop_filter_row_worker(TileWorkerData *const tile_data, + void *unused) { LFWorkerData *const lf_data = &tile_data->lfdata; - (void) arg2; + (void)unused; loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_data->lf_sync, lf_data->num_lf_workers); @@ -145,15 +145,13 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); int i; - // Allocate memory used in thread synchronization. - // This always needs to be done even if frame_filter_level is 0. 
+ if (!frame_filter_level) return; + if (!lf_sync->sync_range || cm->last_height != cm->height) { vp9_loop_filter_dealloc(lf_sync); - vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width); } - if (!frame_filter_level) return; - vp9_loop_filter_frame_init(cm, frame_filter_level); // Initialize cur_sb_col to -1 for all SB rows. @@ -216,7 +214,7 @@ static int get_sync_range(int width) { } // Allocate memory for lf row synchronization -void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, int width) { lf_sync->rows = rows; #if CONFIG_MULTITHREAD diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h index 8b02ef71e..b1fbdeb74 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/decoder/vp9_dthread.h @@ -42,8 +42,8 @@ typedef struct VP9LfSyncData { } VP9LfSync; // Allocate memory for loopfilter row synchronization. -void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync, - int rows, int width); +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, + int width); // Deallocate loopfilter synchronization related mutex and data. void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index b72638322..767bd7f91 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -28,8 +28,8 @@ typedef struct { struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); - int16_t *qcoeff; - int16_t *coeff; + tran_low_t *qcoeff; + tran_low_t *coeff; uint16_t *eobs; struct buf_2d src; @@ -119,8 +119,12 @@ struct macroblock { // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; - void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); - void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); + void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); +#if CONFIG_VP9_HIGHBITDEPTH + void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +#endif }; #ifdef __cplusplus diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index 9b7a93267..12acc5114 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -30,13 +30,13 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, for (i = 0; i < MAX_MB_PLANE; ++i) { for (k = 0; k < 3; ++k) { CHECK_MEM_ERROR(cm, ctx->coeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); + vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k]))); CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); + vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k]))); CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); + vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k]))); CHECK_MEM_ERROR(cm, ctx->eobs[i][k], - vpx_memalign(16, num_pix * sizeof(uint16_t))); + vpx_memalign(16, num_pix * sizeof(*ctx->eobs[i][k]))); ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index 236389b6f..97f074148 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -19,15 +19,15 @@ struct VP9_COMP; typedef struct { MODE_INFO mic; uint8_t *zcoeff_blk; 
- int16_t *coeff[MAX_MB_PLANE][3]; - int16_t *qcoeff[MAX_MB_PLANE][3]; - int16_t *dqcoeff[MAX_MB_PLANE][3]; + tran_low_t *coeff[MAX_MB_PLANE][3]; + tran_low_t *qcoeff[MAX_MB_PLANE][3]; + tran_low_t *dqcoeff[MAX_MB_PLANE][3]; uint16_t *eobs[MAX_MB_PLANE][3]; // dual buffer pointers, 0: in use, 1: best in store - int16_t *coeff_pbuf[MAX_MB_PLANE][3]; - int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; - int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; + tran_low_t *coeff_pbuf[MAX_MB_PLANE][3]; + tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3]; + tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; int is_coded; diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 59222f0a9..eff899610 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -18,15 +18,17 @@ #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" -static INLINE int fdct_round_shift(int input) { - int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - assert(INT16_MIN <= rv && rv <= INT16_MAX); +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert + // and make the bounds consts. + // assert(INT16_MIN <= rv && rv <= INT16_MAX); return rv; } -static void fdct4(const int16_t *input, int16_t *output) { - int16_t step[4]; - int temp1, temp2; +static void fdct4(const tran_low_t *input, tran_low_t *output) { + tran_high_t step[4]; + tran_high_t temp1, temp2; step[0] = input[0] + input[3]; step[1] = input[1] + input[2]; @@ -43,9 +45,9 @@ static void fdct4(const int16_t *input, int16_t *output) { output[3] = fdct_round_shift(temp2); } -void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; - int16_t sum = 0; + tran_low_t sum = 0; for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) sum += input[r * stride + c]; @@ -54,7 +56,7 @@ void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { output[1] = 0; } -void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -63,22 +65,23 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // in normal/row positions). int pass; // We need an intermediate buffer between passes. - int16_t intermediate[4 * 4]; - const int16_t *in = input; - int16_t *out = intermediate; + tran_low_t intermediate[4 * 4]; + const int16_t *in_pass0 = input; + const tran_low_t *in = NULL; + tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - /*canbe16*/ int input[4]; - /*canbe16*/ int step[4]; - /*needs32*/ int temp1, temp2; + tran_high_t input[4]; // canbe16 + tran_high_t step[4]; // canbe16 + tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 4; ++i) { // Load inputs. 
if (0 == pass) { - input[0] = in[0 * stride] * 16; - input[1] = in[1 * stride] * 16; - input[2] = in[2 * stride] * 16; - input[3] = in[3 * stride] * 16; + input[0] = in_pass0[0 * stride] * 16; + input[1] = in_pass0[1 * stride] * 16; + input[2] = in_pass0[2 * stride] * 16; + input[3] = in_pass0[3 * stride] * 16; if (i == 0 && input[0]) { input[0] += 1; } @@ -102,6 +105,7 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { out[1] = fdct_round_shift(temp1); out[3] = fdct_round_shift(temp2); // Do next column (which is a transposed row in second/horizontal pass) + in_pass0++; in++; out += 4; } @@ -119,9 +123,9 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { } } -static void fadst4(const int16_t *input, int16_t *output) { - int x0, x1, x2, x3; - int s0, s1, s2, s3, s4, s5, s6, s7; +static void fadst4(const tran_low_t *input, tran_low_t *output) { + tran_high_t x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; x0 = input[0]; x1 = input[1]; @@ -166,15 +170,15 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_fht4x4_c(const int16_t *input, int16_t *output, +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { vp9_fdct4x4_c(input, output, stride); } else { - int16_t out[4 * 4]; - int16_t *outptr = &out[0]; + tran_low_t out[4 * 4]; + tran_low_t *outptr = &out[0]; int i, j; - int16_t temp_in[4], temp_out[4]; + tran_low_t temp_in[4], temp_out[4]; const transform_2d ht = FHT_4[tx_type]; // Columns @@ -199,10 +203,10 @@ void vp9_fht4x4_c(const int16_t *input, int16_t *output, } } -static void fdct8(const int16_t *input, int16_t *output) { - /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; - /*needs32*/ int t0, t1, t2, t3; - /*canbe16*/ int x0, x1, x2, x3; +static void fdct8(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 s0 = input[0] + input[7]; @@ -251,9 +255,9 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = fdct_round_shift(t3); } -void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; - int16_t sum = 0; + tran_low_t sum = 0; for (r = 0; r < 8; ++r) for (c = 0; c < 8; ++c) sum += input[r * stride + c]; @@ -262,16 +266,16 @@ void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { output[1] = 0; } -void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { +void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { int i, j; - int16_t intermediate[64]; + tran_low_t intermediate[64]; // Transform columns { - int16_t *output = intermediate; - /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; - /*needs32*/ int t0, t1, t2, t3; - /*canbe16*/ int x0, x1, x2, x3; + tran_low_t *output = intermediate; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 int i; for (i = 0; i < 8; i++) { @@ -333,9 +337,9 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { } } -void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; - int16_t sum = 0; + tran_low_t sum = 0; for (r = 0; r < 16; ++r) for (c = 0; c < 16; 
++c) sum += input[r * stride + c]; @@ -344,7 +348,7 @@ void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { output[1] = 0; } -void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -353,37 +357,38 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // in normal/row positions). int pass; // We need an intermediate buffer between passes. - int16_t intermediate[256]; - const int16_t *in = input; - int16_t *out = intermediate; + tran_low_t intermediate[256]; + const int16_t *in_pass0 = input; + const tran_low_t *in = NULL; + tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - /*canbe16*/ int step1[8]; - /*canbe16*/ int step2[8]; - /*canbe16*/ int step3[8]; - /*canbe16*/ int input[8]; - /*needs32*/ int temp1, temp2; + tran_high_t step1[8]; // canbe16 + tran_high_t step2[8]; // canbe16 + tran_high_t step3[8]; // canbe16 + tran_high_t input[8]; // canbe16 + tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in[0 * stride] + in[15 * stride]) * 4; - input[1] = (in[1 * stride] + in[14 * stride]) * 4; - input[2] = (in[2 * stride] + in[13 * stride]) * 4; - input[3] = (in[3 * stride] + in[12 * stride]) * 4; - input[4] = (in[4 * stride] + in[11 * stride]) * 4; - input[5] = (in[5 * stride] + in[10 * stride]) * 4; - input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; - input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; + input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; + input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; + input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; + input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; + input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; + input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; + input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; + input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; - step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; - step1[2] = (in[5 * stride] - in[10 * stride]) * 4; - step1[3] = (in[4 * stride] - in[11 * stride]) * 4; - step1[4] = (in[3 * stride] - in[12 * stride]) * 4; - step1[5] = (in[2 * stride] - in[13 * stride]) * 4; - step1[6] = (in[1 * stride] - in[14 * stride]) * 4; - step1[7] = (in[0 * stride] - in[15 * stride]) * 4; + step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; + step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; + step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; + step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; + step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; + step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; + step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; + step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; } else { // Calculate input for the first 8 results. 
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); @@ -406,9 +411,9 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { } // Work on the first eight values; fdct8(input, even_results); { - /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; - /*needs32*/ int t0, t1, t2, t3; - /*canbe16*/ int x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 s0 = input[0] + input[7]; @@ -514,6 +519,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { } // Do next column (which is a transposed row in second/horizontal pass) in++; + in_pass0++; out += 16; } // Setup in/out for next pass. @@ -522,17 +528,17 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { } } -static void fadst8(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; +static void fadst8(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - int x0 = input[7]; - int x1 = input[0]; - int x2 = input[5]; - int x3 = input[2]; - int x4 = input[3]; - int x5 = input[4]; - int x6 = input[1]; - int x7 = input[6]; + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; // stage 1 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; @@ -600,15 +606,15 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_fht8x8_c(const int16_t *input, int16_t *output, +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { vp9_fdct8x8_c(input, output, stride); } else { - int16_t out[64]; - int16_t *outptr = &out[0]; + tran_low_t out[64]; + tran_low_t *outptr = &out[0]; int i, j; - int16_t temp_in[8], temp_out[8]; + tran_low_t temp_in[8], temp_out[8]; const transform_2d ht = FHT_8[tx_type]; // Columns @@ -633,17 +639,18 @@ void vp9_fht8x8_c(const int16_t *input, int16_t *output, /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ -void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { int i; - int a1, b1, c1, d1, e1; - const int16_t *ip = input; - int16_t *op = output; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; for (i = 0; i < 4; i++) { - a1 = ip[0 * stride]; - b1 = ip[1 * stride]; - c1 = ip[2 * stride]; - d1 = ip[3 * stride]; + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; a1 += b1; d1 = d1 - c1; @@ -657,7 +664,7 @@ void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { op[8] = d1; op[12] = b1; - ip++; + ip_pass0++; op++; } ip = output; @@ -687,12 +694,12 @@ void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { } // Rewrote to use same algorithm as others. 
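For reference, a minimal C sketch of the 1-D 4-point Walsh-Hadamard butterfly that the vp9_fwht4x4_c hunk above widens to tran_high_t; the kernel arithmetic itself is unchanged by the patch. The standalone typedefs and the fwht4_sketch name are assumptions for illustration only (in the tree the types come from vp9/common/vp9_idct.h):

#include <stdint.h>

typedef int32_t tran_low_t;   /* assumed highbitdepth widths, for the sketch */
typedef int64_t tran_high_t;

/* One pass of the reversible, orthonormal 4-point WHT used for lossless
 * mode (3.5 adds, 0.5 shifts per pixel), mirroring vp9_fwht4x4_c. */
static void fwht4_sketch(const int16_t in[4], tran_low_t out[4]) {
  tran_high_t a1 = in[0];
  tran_high_t b1 = in[1];
  tran_high_t c1 = in[2];
  tran_high_t d1 = in[3];
  tran_high_t e1;

  a1 += b1;             /* pair sum */
  d1 -= c1;             /* pair difference */
  e1 = (a1 - d1) >> 1;  /* shared half term */
  b1 = e1 - b1;
  c1 = e1 - c1;
  a1 -= c1;
  d1 += b1;

  out[0] = (tran_low_t)a1;  /* a1, c1, d1, b1 output order, matching the */
  out[1] = (tran_low_t)c1;  /* column pass of the real implementation    */
  out[2] = (tran_low_t)d1;
  out[3] = (tran_low_t)b1;
}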
-static void fdct16(const int16_t in[16], int16_t out[16]) { - /*canbe16*/ int step1[8]; - /*canbe16*/ int step2[8]; - /*canbe16*/ int step3[8]; - /*canbe16*/ int input[8]; - /*needs32*/ int temp1, temp2; +static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { + tran_high_t step1[8]; // canbe16 + tran_high_t step2[8]; // canbe16 + tran_high_t step3[8]; // canbe16 + tran_high_t input[8]; // canbe16 + tran_high_t temp1, temp2; // needs32 // step 1 input[0] = in[0] + in[15]; @@ -715,9 +722,9 @@ static void fdct16(const int16_t in[16], int16_t out[16]) { // fdct8(step, step); { - /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; - /*needs32*/ int t0, t1, t2, t3; - /*canbe16*/ int x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 s0 = input[0] + input[7]; @@ -828,25 +835,26 @@ static void fdct16(const int16_t in[16], int16_t out[16]) { out[15] = fdct_round_shift(temp2); } -static void fadst16(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - - int x0 = input[15]; - int x1 = input[0]; - int x2 = input[13]; - int x3 = input[2]; - int x4 = input[11]; - int x5 = input[4]; - int x6 = input[9]; - int x7 = input[6]; - int x8 = input[7]; - int x9 = input[8]; - int x10 = input[5]; - int x11 = input[10]; - int x12 = input[3]; - int x13 = input[12]; - int x14 = input[1]; - int x15 = input[14]; +static void fadst16(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; // stage 1 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; @@ -997,15 +1005,15 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_fht16x16_c(const int16_t *input, int16_t *output, +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { vp9_fdct16x16_c(input, output, stride); } else { - int16_t out[256]; - int16_t *outptr = &out[0]; + tran_low_t out[256]; + tran_low_t *outptr = &out[0]; int i, j; - int16_t temp_in[16], temp_out[16]; + tran_low_t temp_in[16], temp_out[16]; const transform_2d ht = FHT_16[tx_type]; // Columns @@ -1028,19 +1036,21 @@ void vp9_fht16x16_c(const int16_t *input, int16_t *output, } } -static INLINE int dct_32_round(int input) { - int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - assert(-131072 <= rv && rv <= 131071); +static INLINE tran_high_t dct_32_round(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert, + // and make the bounds consts. 
+ // assert(-131072 <= rv && rv <= 131071); return rv; } -static INLINE int half_round_shift(int input) { - int rv = (input + 1 + (input < 0)) >> 2; +static INLINE tran_high_t half_round_shift(tran_high_t input) { + tran_high_t rv = (input + 1 + (input < 0)) >> 2; return rv; } -static void fdct32(const int *input, int *output, int round) { - int step[32]; +static void fdct32(const tran_high_t *input, tran_high_t *output, int round) { + tran_high_t step[32]; // Stage 1 step[0] = input[0] + input[(32 - 1)]; step[1] = input[1] + input[(32 - 2)]; @@ -1362,9 +1372,9 @@ static void fdct32(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { +void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; - int16_t sum = 0; + tran_low_t sum = 0; for (r = 0; r < 32; ++r) for (c = 0; c < 32; ++c) sum += input[r * stride + c]; @@ -1373,13 +1383,13 @@ void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { output[1] = 0; } -void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { int i, j; - int output[32 * 32]; + tran_high_t output[32 * 32]; // Columns for (i = 0; i < 32; ++i) { - int temp_in[32], temp_out[32]; + tran_high_t temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; fdct32(temp_in, temp_out, 0); @@ -1389,7 +1399,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { // Rows for (i = 0; i < 32; ++i) { - int temp_in[32], temp_out[32]; + tran_high_t temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; fdct32(temp_in, temp_out, 0); @@ -1401,13 +1411,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { // Note that although we use dct_32_round in dct32 computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
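The bound in the disabled assert, -131072..131071, is the signed 18-bit range that held while all input was 8-bit; 10- and 12-bit input widens the intermediates, so the check stays commented out until new bounds are derived. The rounding itself is unchanged: add half of 2^DCT_CONST_BITS, then shift. A self-contained illustration, assuming the usual libvpx values (DCT_CONST_BITS of 14, cospi_16_64 of 11585, and the standard ROUND_POWER_OF_TWO definition):

#include <assert.h>
#include <stdint.h>

typedef int64_t tran_high_t;  /* assumed intermediate type */

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static tran_high_t dct_32_round_sketch(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  /* Exact multiples of 2^14 pass through unchanged. */
  assert(dct_32_round_sketch(3 << DCT_CONST_BITS) == 3);
  /* The added 2^13 makes the halfway case round up instead of truncate. */
  assert(dct_32_round_sketch((3 << DCT_CONST_BITS) + (1 << 13)) == 4);
  /* One 12-bit sample times one cosine constant is still small, but sums
   * across the 32-point stages can exceed the old 18-bit bound. */
  assert(dct_32_round_sketch(4095LL * 11585) == 2896);
  return 0;
}

The rate-distortion variant below additionally passes round = 1 into fdct32 so half_round_shift trims 2 bits in the row pass, which is what keeps that flow within 16-bit precision.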
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
 int i, j;
- int output[32 * 32];
+ tran_high_t output[32 * 32];
 // Columns
 for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
 for (j = 0; j < 32; ++j)
 temp_in[j] = input[j * stride + i] * 4;
 fdct32(temp_in, temp_out, 0);
@@ -1420,7 +1430,7 @@
 // Rows
 for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
 for (j = 0; j < 32; ++j)
 temp_in[j] = output[j + i * 32];
 fdct32(temp_in, temp_out, 1);
@@ -1428,3 +1438,61 @@
 out[j + i * 32] = temp_out[j];
 }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ vp9_fdct4x4_c(input, output, stride);
+}
+
+void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_c(input, final_output, stride);
+}
+
+void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_1_c(input, output, stride);
+}
+
+void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_c(input, output, stride);
+}
+
+void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ vp9_fwht4x4_c(input, output, stride);
+}
+
+void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht16x16_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
+ vp9_fdct32x32_1_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+ vp9_fdct32x32_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ vp9_fdct32x32_rd_c(input, out, stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index c4cf5eeb6..75b94499d 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -89,9 +89,9 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
 int total_adj = 0;
 int shift_inc = 1;
- /* If motion_magnitude is small, making the denoiser more aggressive by
- * increasing the adjustment for each level. Add another increment for
- * blocks that are labeled for increase denoising. */
+ // If motion_magnitude is small, make the denoiser more aggressive by
+ // increasing the adjustment for each level. Add another increment for
+ // blocks that are labeled for increased denoising.
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { if (increase_denoising) { shift_inc = 2; diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index a913add86..fa714b132 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -18,7 +18,7 @@ extern "C" { #endif -#define MOTION_MAGNITUDE_THRESHOLD (8*3) +#define MOTION_MAGNITUDE_THRESHOLD (8 * 3) typedef enum vp9_denoiser_decision { COPY_BLOCK, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 72ced0529..c62b52fb5 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2466,6 +2466,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, vp9_zero(cpi->mb.pred_mv); cpi->pc_root->index = 0; + // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good- + // quality encoding. Need to evaluate it in real-time encoding later to + // decide if it can be removed too. And then, do the code cleanup. if ((sf->partition_search_type == SEARCH_PARTITION && sf->use_lastframe_partitioning) || sf->partition_search_type == FIXED_PARTITION || diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 667845072..794e6d0e3 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -107,9 +107,9 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; uint8_t token_cache[1024]; - const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); - int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; const PLANE_TYPE type = pd->plane_type; const int default_eob = 16 << (tx_size << 1); @@ -294,22 +294,33 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, } static INLINE void fdct32x32(int rd_transform, - const int16_t *src, int16_t *dst, int src_stride) { + const int16_t *src, tran_low_t *dst, + int src_stride) { if (rd_transform) vp9_fdct32x32_rd(src, dst, src_stride); else vp9_fdct32x32(src, dst, src_stride); } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void high_fdct32x32(int rd_transform, const int16_t *src, + tran_low_t *dst, int src_stride) { + if (rd_transform) + vp9_high_fdct32x32_rd(src, dst, src_stride); + else + vp9_high_fdct32x32(src, dst, src_stride); +} +#endif + void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint16_t *const eob = &p->eobs[block]; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; int i, j; @@ -357,9 +368,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane 
*const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint16_t *const eob = &p->eobs[block]; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; int i, j; @@ -405,9 +416,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint16_t *const eob = &p->eobs[block]; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; int i, j; @@ -458,7 +469,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, struct optimize_ctx *const ctx = args->ctx; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); int i, j; uint8_t *dst; ENTROPY_CONTEXT *a, *l; @@ -538,7 +549,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); int i, j; uint8_t *dst; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); @@ -587,9 +598,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const scan_order *scan_order; TX_TYPE tx_type; PREDICTION_MODE mode; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b6e606d78..b3884d056 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -556,6 +556,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = oxcf->use_highbitdepth; +#endif cm->color_space = UNKNOWN; cm->width = oxcf->width; @@ -613,6 +616,11 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { assert(cm->bit_depth > VPX_BITS_8); cpi->oxcf = *oxcf; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->oxcf.use_highbitdepth) { + cpi->mb.e_mbd.bd = (int)cm->bit_depth; + } +#endif 
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -1272,7 +1280,10 @@ static void generate_psnr_packet(VP9_COMP *cpi) { pkt.data.psnr.psnr[i] = psnr.psnr[i]; } pkt.kind = VPX_CODEC_PSNR_PKT; - vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); + if (is_two_pass_svc(cpi)) + cpi->svc.layer_context[cpi->svc.spatial_layer_id].psnr_pkt = pkt.data.psnr; + else + vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); } int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) { @@ -2768,7 +2779,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->oxcf.use_highbitdepth) + cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4; + else + cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4; + cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add : + vp9_high_idct4x4_add; +#else cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4; +#endif cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 0d3c4c19a..80774de92 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -217,6 +217,9 @@ typedef struct VP9EncoderConfig { vp8e_tuning tuning; vp9e_tune_content content; +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth; +#endif } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8041b59cf..54b57cf88 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -437,41 +437,51 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_set_quantizer(cm, find_fp_qindex()); if (lc != NULL) { - MV_REFERENCE_FRAME ref_frame = LAST_FRAME; twopass = &lc->twopass; - if (cpi->common.current_video_frame == 0) { - cpi->ref_frame_flags = 0; + cpi->lst_fb_idx = cpi->svc.spatial_layer_id; + cpi->ref_frame_flags = VP9_LAST_FLAG; + + if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < + REF_FRAMES) { + cpi->gld_fb_idx = + cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; + cpi->ref_frame_flags |= VP9_GOLD_FLAG; + cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); } else { - if (lc->current_video_frame_in_layer < - (unsigned int)cpi->svc.number_temporal_layers) - cpi->ref_frame_flags = VP9_GOLD_FLAG; - else - cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + cpi->refresh_golden_frame = 0; } + if (lc->current_video_frame_in_layer == 0) + cpi->ref_frame_flags = 0; + vp9_scale_references(cpi); // Use either last frame or alt frame for motion search. if (cpi->ref_frame_flags & VP9_LAST_FLAG) { first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - ref_frame = LAST_FRAME; if (first_ref_buf == NULL) first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - ref_frame = GOLDEN_FRAME; - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + const int ref_idx = + cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)]; + const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1]; + + gld_yv12 = (scaled_idx != ref_idx) ? 
&cm->frame_bufs[scaled_idx].buf : + get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } else { + gld_yv12 = NULL; } recon_y_stride = new_yv12->y_stride; recon_uv_stride = new_yv12->uv_stride; uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); - // Disable golden frame for svc first pass for now. - gld_yv12 = NULL; - set_ref_ptrs(cm, xd, ref_frame, NONE); + set_ref_ptrs(cm, xd, + (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE, + (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE); cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source); @@ -581,7 +591,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search. - if (cm->current_video_frame > 0) { + if ((lc == NULL && cm->current_video_frame > 0) || + (lc != NULL && lc->current_video_frame_in_layer > 0)) { int tmp_err, motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. MV mv = {0, 0} , tmp_mv = {0, 0}; @@ -628,7 +639,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } // Search in an older reference frame. - if (cm->current_video_frame > 1 && gld_yv12 != NULL) { + if (((lc == NULL && cm->current_video_frame > 1) || + (lc != NULL && lc->current_video_frame_in_layer > 1)) + && gld_yv12 != NULL) { // Assume 0,0 motion with no mv overhead. int gf_motion_error; @@ -893,7 +906,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { // Special case for the first frame. Copy into the GF buffer as a second // reference. - if (cm->current_video_frame == 0 && gld_yv12 != NULL) { + if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index d6f6b2563..a25dc61aa 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -320,23 +320,23 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, switch (whichdir) { case 0: CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(diag, tr - hstep, tc - hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc - hstep); break; case 1: CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(diag, tr - hstep, tc + hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc + hstep); break; case 2: CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(down, tr + hstep, tc); - CHECK_BETTER(diag, tr + hstep, tc - hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc - hstep); break; case 3: CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(down, tr + hstep, tc); - CHECK_BETTER(diag, tr + hstep, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc + hstep); break; } } else { @@ -648,11 +648,11 @@ static int vp9_pattern_search(const MACROBLOCK *x, // Returns the one-away integer pel sad values around the best as follows: // sad_list[0]: sad at the best integer pel // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel - // sad_list[2]: sad at delta {-1, 0} (top) from the best integer pel + // sad_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel - // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel + // sad_list[4]: sad at 
delta {-1, 0} (top) from the best integer pel
 if (sad_list) {
- static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}};
+ static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
 sad_list[0] = bestsad;
 if (check_bounds(x, br, bc, 1)) {
 for (i = 0; i < 4; i++) {
@@ -660,7 +660,10 @@ static int vp9_pattern_search(const MACROBLOCK *x,
 bc + neighbors[i].col};
 sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
 get_buf_from_mv(in_what, &this_mv),
- in_what->stride);
+ in_what->stride) +
+ (use_mvcost ?
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit) :
+ 0);
 }
 } else {
 for (i = 0; i < 4; i++) {
@@ -671,7 +674,300 @@ static int vp9_pattern_search(const MACROBLOCK *x,
 else
 sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
 get_buf_from_mv(in_what, &this_mv),
+ in_what->stride) +
+ (use_mvcost ?
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit) :
+ 0);
+ }
+ }
+ }
+ best_mv->row = br;
+ best_mv->col = bc;
+ return bestsad;
+}
+
+// A specialized function where the smallest scale search candidates
+// are the four 1-away neighbors, and sad_list is non-NULL.
+// TODO(debargha): Merge this function with the one above. Also remove
+// use_mvcost option since it is always 1, to save unnecessary branches.
+static int vp9_pattern_search_sad(const MACROBLOCK *x,
+ MV *ref_mv,
+ int search_param,
+ int sad_per_bit,
+ int do_init_search,
+ int *sad_list,
+ const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost,
+ const MV *center_mv,
+ MV *best_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES]
+ [MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+ int best_init_s = search_param_to_steps[search_param];
+ // Adjust ref_mv to make sure it is within MV range.
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->row;
+ bc = ref_mv->col;
+ if (sad_list != NULL) {
+ sad_list[0] = sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+ // Search all possible scales up to the search param around the center point,
+ // and pick the scale of the best point as the starting scale for
+ // further steps around it.
+ if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + if (check_bounds(x, br, bc, 1 << t)) { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = {br + candidates[t][i].row, + bc + candidates[t][i].col}; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = {br + candidates[t][i].row, + bc + candidates[t][i].col}; + if (!is_mv_in(x, &this_mv)) + continue; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. + if (best_init_s != -1) { + int do_sad = (num_candidates[0] == 4 && sad_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= do_sad; s--) { + if (!do_init_search || s != best_init_s) { + if (check_bounds(x, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = {br + candidates[s][i].row, + bc + candidates[s][i].col}; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = {br + candidates[s][i].row, + bc + candidates[s][i].col}; + if (!is_mv_in(x, &this_mv)) + continue; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } + + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + if (check_bounds(x, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col}; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col}; + if (!is_mv_in(x, &this_mv)) + continue; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } + + // Note: If we enter the if below, then sad_list must be non-NULL. 
+ if (s == 0) { + sad_list[0] = bestsad; + if (!do_init_search || s != best_init_s) { + if (check_bounds(x, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = {br + candidates[s][i].row, + bc + candidates[s][i].col}; + sad_list[i + 1] = + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = {br + candidates[s][i].row, + bc + candidates[s][i].col}; + if (!is_mv_in(x, &this_mv)) + continue; + sad_list[i + 1] = + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX; + sad_list[((k + 2) % 4) + 1] = sad_list[0]; + sad_list[0] = bestsad; + + if (check_bounds(x, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col}; + sad_list[next_chkpts_indices[i] + 1] = + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col}; + if (!is_mv_in(x, &this_mv)) { + sad_list[next_chkpts_indices[i] + 1] = INT_MAX; + continue; + } + sad_list[next_chkpts_indices[i] + 1] = + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } + } + } + + // Returns the one-away integer pel sad values around the best as follows: + // sad_list[0]: sad at the best integer pel + // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel + // sad_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel + // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel + // sad_list[4]: sad at delta {-1, 0} (top) from the best integer pel + if (sad_list) { + static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}}; + if (sad_list[0] == INT_MAX) { + sad_list[0] = bestsad; + if (check_bounds(x, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + sad_list[i + 1] = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + } + } else { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + if (!is_mv_in(x, &this_mv)) + sad_list[i + 1] = INT_MAX; + else + sad_list[i + 1] = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + } + } + } else { + if (use_mvcost) { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + if (sad_list[i + 1] != INT_MAX) { + sad_list[i + 1] += + mvsad_err_cost(x, &this_mv, &fcenter_mv, 
sad_per_bit); + } + } } } } @@ -784,10 +1080,10 @@ int vp9_bigdia_search(const MACROBLOCK *x, {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024}, {-512, 512}, {-1024, 0}}, }; - return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, sad_list, vfp, use_mvcost, - center_mv, best_mv, - bigdia_num_candidates, bigdia_candidates); + return vp9_pattern_search_sad(x, ref_mv, search_param, sad_per_bit, + do_init_search, sad_list, vfp, use_mvcost, + center_mv, best_mv, + bigdia_num_candidates, bigdia_candidates); } int vp9_square_search(const MACROBLOCK *x, diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index d36548996..5557d7fe7 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -77,7 +77,6 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, while (filter_step > 0) { const int filt_high = MIN(filt_mid + filter_step, max_filter_level); const int filt_low = MAX(filt_mid - filter_step, min_filter_level); - int filt_err; // Bias against raising loop filter in favor of lowering it. int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; @@ -92,17 +91,14 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score if (ss_err[filt_low] < 0) { - filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame); - ss_err[filt_low] = filt_err; - } else { - filt_err = ss_err[filt_low]; + ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame); } // If value is close to the best so far then bias towards a lower loop // filter value. - if ((filt_err - bias) < best_err) { + if ((ss_err[filt_low] - bias) < best_err) { // Was it actually better than the previous best? - if (filt_err < best_err) - best_err = filt_err; + if (ss_err[filt_low] < best_err) + best_err = ss_err[filt_low]; filt_best = filt_low; } @@ -111,14 +107,11 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Now look at filt_high if (filt_direction >= 0 && filt_high != filt_mid) { if (ss_err[filt_high] < 0) { - filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame); - ss_err[filt_high] = filt_err; - } else { - filt_err = ss_err[filt_high]; + ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame); } // Was it better than the previous best? 
- if (filt_err < (best_err - bias)) { - best_err = filt_err; + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; filt_best = filt_high; } } diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index eababdbca..d49eb956f 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -19,9 +19,9 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, +void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -40,9 +40,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, *eob_ptr = eob + 1; } -void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, +void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -62,11 +62,11 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, *eob_ptr = eob + 1; } -void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count, +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -78,13 +78,13 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count, (void)zbin_oq_value; (void)iscan; - vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. - for (i = 0; i < count; i++) { + for (i = 0; i < n_coeffs; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); @@ -105,12 +105,12 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count, // TODO(jingning) Refactor this file and combine functions with similar // operations. 
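Pending that refactor, all the variants here share one per-coefficient core: take the coefficient's magnitude, add the DC/AC rounding term, multiply by the Q16 quantizer and keep the high 16 bits, then restore the sign and dequantize. A reduced sketch of that step, with illustrative names rather than the library's API (the int16_t clamp before the multiply stands in for the range guard the library's quantizers apply so the 32-bit product cannot overflow):

#include <stdint.h>

typedef int32_t tran_low_t;  /* assumed coefficient type */

/* One coefficient of the fast-path quantizer; returns nonzero when the
 * quantized value is nonzero, i.e. when it would advance the eob. */
static int quantize_coeff_sketch(tran_low_t coeff, int16_t round,
                                 int16_t quant, int16_t dequant,
                                 tran_low_t *qcoeff, tran_low_t *dqcoeff) {
  const int coeff_sign = (coeff >> 31);                 /* 0 or -1 */
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  int tmp = abs_coeff + round;
  if (tmp > INT16_MAX) tmp = INT16_MAX;  /* clamp before the Q16 multiply */
  tmp = (tmp * quant) >> 16;
  *qcoeff = (tmp ^ coeff_sign) - coeff_sign;            /* restore the sign */
  *dqcoeff = *qcoeff * dequant;
  return tmp != 0;
}

For example, coeff = -100 with round = 16, quant = 16384 (0.25 in Q16) and dequant = 4 yields qcoeff = -29 and dqcoeff = -116.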
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -120,8 +120,8 @@ void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, (void)zbin_oq_value; (void)iscan; - vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { for (i = 0; i < n_coeffs; i++) { @@ -146,27 +146,27 @@ void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } -void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, +void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = (int)count, eob = -1; + int i, non_zero_count = (int)n_coeffs, eob = -1; const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, zbin_ptr[1] + zbin_oq_value }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; - vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { // Pre-scan pass - for (i = (int)count - 1; i >= 0; i--) { + for (i = (int)n_coeffs - 1; i >= 0; i--) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; @@ -199,12 +199,12 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, *eob_ptr = eob + 1; } -void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -217,8 +217,8 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int i, eob = -1; (void)iscan; - vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { // Pre-scan pass @@ -280,13 +280,19 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { *shift = 1 << (16 - l); } +static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { + int quant = vp9_dc_quant(q, 0); + (void) bit_depth; + return q == 0 ? 64 : (quant < 148 ? 
84 : 80); +} + void vp9_init_quantizer(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; QUANTS *const quants = &cpi->quants; int i, q, quant; for (q = 0; q < QINDEX_RANGE; q++) { - const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80); + const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); const int qrounding_factor = q == 0 ? 64 : 48; for (i = 0; i < 2; ++i) { diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 262529b05..d7edb0bdc 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -37,17 +37,29 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, +void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr); -void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, +void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr); void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#endif + struct VP9_COMP; struct VP9Common; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 939351d2d..9df85defd 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -249,7 +249,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, *out_dist_sum = dist_sum << 4; } -int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; int64_t error = 0, sqcoeff = 0; @@ -288,7 +288,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; const int eob = p->eobs[block]; - const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = x->token_costs[tx_size][type][is_inter_block(mbmi)]; uint8_t token_cache[32 * 32]; @@ -358,8 +358,8 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, const struct macroblockd_plane *const pd = &xd->plane[plane]; int64_t this_sse; int shift = tx_size == TX_32X32 ? 
0 : 2; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; args->sse = this_sse >> shift; @@ -405,8 +405,8 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, dist_block(plane, block, tx_size, args); } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) { // compute DC coefficient - int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); - int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; args->dist = args->sse; @@ -690,7 +690,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, block, 1, TX_4X4, mode, @@ -1137,7 +1137,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, for (idy = 0; idy < height / 4; ++idy) { for (idx = 0; idx < width / 4; ++idx) { int64_t ssz, rd, rd1, rd2; - int16_t* coeff; + tran_low_t* coeff; k += (idy * 2 + idx); coeff = BLOCK_OFFSET(p->coeff, k); diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c index d06263676..cee6ce140 100644 --- a/vp9/encoder/vp9_sad.c +++ b/vp9/encoder/vp9_sad.c @@ -14,6 +14,9 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#if CONFIG_VP9_HIGHBITDEPTH +#include "vp9/common/vp9_common.h" +#endif #include "vp9/encoder/vp9_variance.h" static INLINE unsigned int sad(const uint8_t *a, int a_stride, @@ -131,3 +134,138 @@ sadMxN(4, 4) sadMxNxK(4, 4, 3) sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE unsigned int high_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE unsigned int high_sadb(const uint8_t *a8, int a_stride, + const uint16_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define high_sadMxN(m, n) \ +unsigned int vp9_high_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return high_sad(src, src_stride, ref, ref_stride, m, n); \ +} \ +unsigned int vp9_high_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + vp9_high_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ 
+ return high_sadb(src, src_stride, comp_pred, m, m, n); \ +} + +#define high_sadMxNxK(m, n, k) \ +void vp9_high_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sads) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \ +} + +#define high_sadMxNx4D(m, n) \ +void vp9_high_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const refs[], \ + int ref_stride, unsigned int *sads) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \ +} + +// 64x64 +high_sadMxN(64, 64) +high_sadMxNxK(64, 64, 3) +high_sadMxNxK(64, 64, 8) +high_sadMxNx4D(64, 64) + +// 64x32 +high_sadMxN(64, 32) +high_sadMxNx4D(64, 32) + +// 32x64 +high_sadMxN(32, 64) +high_sadMxNx4D(32, 64) + +// 32x32 +high_sadMxN(32, 32) +high_sadMxNxK(32, 32, 3) +high_sadMxNxK(32, 32, 8) +high_sadMxNx4D(32, 32) + +// 32x16 +high_sadMxN(32, 16) +high_sadMxNx4D(32, 16) + +// 16x32 +high_sadMxN(16, 32) +high_sadMxNx4D(16, 32) + +// 16x16 +high_sadMxN(16, 16) +high_sadMxNxK(16, 16, 3) +high_sadMxNxK(16, 16, 8) +high_sadMxNx4D(16, 16) + +// 16x8 +high_sadMxN(16, 8) +high_sadMxNxK(16, 8, 3) +high_sadMxNxK(16, 8, 8) +high_sadMxNx4D(16, 8) + +// 8x16 +high_sadMxN(8, 16) +high_sadMxNxK(8, 16, 3) +high_sadMxNxK(8, 16, 8) +high_sadMxNx4D(8, 16) + +// 8x8 +high_sadMxN(8, 8) +high_sadMxNxK(8, 8, 3) +high_sadMxNxK(8, 8, 8) +high_sadMxNx4D(8, 8) + +// 8x4 +high_sadMxN(8, 4) +high_sadMxNxK(8, 4, 8) +high_sadMxNx4D(8, 4) + +// 4x8 +high_sadMxN(4, 8) +high_sadMxNxK(4, 8, 8) +high_sadMxNx4D(4, 8) + +// 4x4 +high_sadMxN(4, 4) +high_sadMxNxK(4, 4, 3) +high_sadMxNxK(4, 4, 8) +high_sadMxNx4D(4, 4) + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index d9ac12262..889a8be21 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -67,15 +67,11 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, : USE_LARGESTALL; if (MIN(cm->width, cm->height) >= 720) { - sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; - sf->last_partitioning_redo_frequency = 3; sf->disable_split_mask = cm->show_frame ? 
DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; - sf->last_partitioning_redo_frequency = 2; - sf->lf_motion_threshold = NO_MOTION_THRESHOLD; } sf->reference_masking = 1; @@ -86,7 +82,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX; - sf->adjust_partitioning_from_last_frame = 1; if (MIN(cm->width, cm->height) >= 720) sf->partition_search_breakout_dist_thr = (1 << 24); @@ -110,8 +105,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->cb_pred_filter_search = 1; sf->alt_ref_search_fp = 1; sf->motion_field_mode_search = !boosted; - sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; - sf->last_partitioning_redo_frequency = 2; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; @@ -130,11 +123,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->use_square_partition_only = 1; sf->tx_size_search_method = USE_LARGESTALL; sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->mv.search_method = BIGDIA; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; sf->adaptive_rd_thresh = 4; sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH | FLAG_EARLY_TERMINATE; sf->disable_filter_search_var_thresh = 200; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; sf->use_lp32x32fdct = 1; sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 1fc43a427..d180d1a8c 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -36,6 +36,7 @@ typedef struct { int gold_ref_idx; int has_alt_frame; size_t layer_size; + struct vpx_psnr_pkt psnr_pkt; } LAYER_CONTEXT; typedef struct { diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 6068b85a0..263883f66 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -212,7 +212,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, TOKENEXTRA *t = *tp; /* store tokens starting here */ int eob = p->eobs[block]; const PLANE_TYPE type = pd->plane_type; - const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; const scan_order *so; diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index afbb191ad..c97f93fda 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -267,3 +267,375 @@ void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, ref += ref_stride; } } + +#if CONFIG_VP9_HIGHBITDEPTH +void high_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, + uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +void high_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, + int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + high_variance64(a8, a_stride, b8, 
b_stride, w, h, &sse_long, &sum_long); + *sse = sse_long; + *sum = sum_long; +} + +void high_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, + int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +void high_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, + int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + +static void high_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = + ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void high_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = + ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGH_VAR(W, H) \ +unsigned int vp9_high_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + high_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vp9_high_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + high_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vp9_high_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + high_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define HIGH_SUBPIX_VAR(W, H) \ +unsigned int vp9_high_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ +\ + high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, BILINEAR_FILTERS_2TAP(xoffset)); \ + high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + dst_stride, sse); \ +} \ +\ +unsigned int 
vp9_high_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ +\ + high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, BILINEAR_FILTERS_2TAP(xoffset)); \ + high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + dst_stride, sse); \ +} \ +\ +unsigned int vp9_high_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ +\ + high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, BILINEAR_FILTERS_2TAP(xoffset)); \ + high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + dst_stride, sse); \ +} + +#define HIGH_SUBPIX_AVG_VAR(W, H) \ +unsigned int vp9_high_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \ +\ + high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, BILINEAR_FILTERS_2TAP(xoffset)); \ + high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \ + W); \ +\ + return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ +} \ +\ +unsigned int vp9_high_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \ +\ + high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, BILINEAR_FILTERS_2TAP(xoffset)); \ + high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \ + W); \ +\ + return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ +} \ +\ +unsigned int vp9_high_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \ +\ + high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, BILINEAR_FILTERS_2TAP(xoffset)); \ + high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \ + W); \ +\ + return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ +} + 
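Every variance kernel generated above reduces to the same identity: variance = sse - sum^2/N over the pixel differences, with the 10- and 12-bit wrappers pre-scaling sse and sum (as in high_10_variance and high_12_variance earlier) so the result still fits the 8-bit-era unsigned int interface. A tiny self-contained check of that identity, with arbitrary sample values:

#include <assert.h>
#include <stdint.h>

/* variance = sse - sum^2 / N, as the HIGH_VAR macros compute it */
static int64_t variance_from_moments(uint64_t sse, int64_t sum, int n) {
  return (int64_t)sse - (sum * sum) / n;
}

int main(void) {
  const uint16_t a[4] = {100, 200, 300, 400};  /* e.g. 10-bit samples */
  const uint16_t b[4] = {90, 210, 280, 450};
  uint64_t sse = 0;
  int64_t sum = 0;
  int i;
  for (i = 0; i < 4; i++) {
    const int d = a[i] - b[i];
    sum += d;
    sse += (int64_t)d * d;
  }
  /* diffs are 10, -10, 20, -50: sum = -30, sse = 3100 */
  assert(sum == -30 && sse == 3100);
  assert(variance_from_moments(sse, sum, 4) == 3100 - 225);
  return 0;
}

The HIGH_GET_VAR and HIGH_MSE wrappers below expose the raw moments instead: the getters return sse and sum directly, and the MSE kernels return sse without the sum^2 correction.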
+#define HIGH_GET_VAR(S) \
+void vp9_high_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *ref, int ref_stride, \
+                                  unsigned int *sse, int *sum) { \
+  high_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sse, int *sum) { \
+  high_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sse, int *sum) { \
+  high_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGH_MSE(W, H) \
+unsigned int vp9_high_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse) { \
+  int sum; \
+  high_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_high_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          unsigned int *sse) { \
+  int sum; \
+  high_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_high_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          unsigned int *sse) { \
+  int sum; \
+  high_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+HIGH_GET_VAR(8)
+HIGH_GET_VAR(16)
+
+HIGH_MSE(16, 16)
+HIGH_MSE(16, 8)
+HIGH_MSE(8, 16)
+HIGH_MSE(8, 8)
+
+HIGH_VAR(4, 4)
+HIGH_SUBPIX_VAR(4, 4)
+HIGH_SUBPIX_AVG_VAR(4, 4)
+
+HIGH_VAR(4, 8)
+HIGH_SUBPIX_VAR(4, 8)
+HIGH_SUBPIX_AVG_VAR(4, 8)
+
+HIGH_VAR(8, 4)
+HIGH_SUBPIX_VAR(8, 4)
+HIGH_SUBPIX_AVG_VAR(8, 4)
+
+HIGH_VAR(8, 8)
+HIGH_SUBPIX_VAR(8, 8)
+HIGH_SUBPIX_AVG_VAR(8, 8)
+
+HIGH_VAR(8, 16)
+HIGH_SUBPIX_VAR(8, 16)
+HIGH_SUBPIX_AVG_VAR(8, 16)
+
+HIGH_VAR(16, 8)
+HIGH_SUBPIX_VAR(16, 8)
+HIGH_SUBPIX_AVG_VAR(16, 8)
+
+HIGH_VAR(16, 16)
+HIGH_SUBPIX_VAR(16, 16)
+HIGH_SUBPIX_AVG_VAR(16, 16)
+
+HIGH_VAR(16, 32)
+HIGH_SUBPIX_VAR(16, 32)
+HIGH_SUBPIX_AVG_VAR(16, 32)
+
+HIGH_VAR(32, 16)
+HIGH_SUBPIX_VAR(32, 16)
+HIGH_SUBPIX_AVG_VAR(32, 16)
+
+HIGH_VAR(32, 32)
+HIGH_SUBPIX_VAR(32, 32)
+HIGH_SUBPIX_AVG_VAR(32, 32)
+
+HIGH_VAR(32, 64)
+HIGH_SUBPIX_VAR(32, 64)
+HIGH_SUBPIX_AVG_VAR(32, 64)
+
+HIGH_VAR(64, 32)
+HIGH_SUBPIX_VAR(64, 32)
+HIGH_SUBPIX_AVG_VAR(64, 32)
+
+HIGH_VAR(64, 64)
+HIGH_SUBPIX_VAR(64, 64)
+HIGH_SUBPIX_AVG_VAR(64, 64)
+
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                            int width, int height, const uint8_t *ref8,
+                            int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4a194b72c..c51d08d04 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -22,6 +22,23 @@ void variance(const uint8_t *a, int a_stride,
               int w, int h,
               unsigned int *sse, int *sum);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance(const uint8_t *a8, int a_stride,
+                   const uint8_t *b8, int b_stride,
+                   int w, int h,
+                   unsigned int *sse, int *sum);
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      int w, int h,
+                      unsigned int *sse, int *sum);
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      int w, int h,
+                      unsigned int *sse, int *sum);
+#endif
+
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
@@ -81,6 +98,11 @@ typedef struct vp9_variance_vtable {
 void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                        int height, const uint8_t *ref, int ref_stride);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride);
+#endif
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0f0b7a5ab..4b24960ef 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -555,7 +555,7 @@ static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
                                                   va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args);
+  extra_cfg.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
@@ -686,6 +686,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
   if (res == VPX_CODEC_OK) {
     set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    priv->oxcf.use_highbitdepth =
+        (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
     priv->cpi = vp9_create_compressor(&priv->oxcf);
     if (priv->cpi == NULL)
       res = VPX_CODEC_MEM_ERROR;
@@ -981,15 +985,20 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
         cx_data_sz -= size;
 #if CONFIG_SPATIAL_SVC
         if (is_two_pass_svc(cpi)) {
-          vpx_codec_cx_pkt_t pkt;
+          vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
           int i;
-          vp9_zero(pkt);
-          pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          vp9_zero(pkt_sizes);
+          vp9_zero(pkt_psnr);
+          pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
           for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
-            pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
-            cpi->svc.layer_context[i].layer_size = 0;
+            LAYER_CONTEXT *lc = &cpi->svc.layer_context[i];
+            pkt_sizes.data.layer_sizes[i] = lc->layer_size;
+            pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
+            lc->layer_size = 0;
           }
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
         }
 #endif
       }
@@ -1231,7 +1240,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8E_SET_ACTIVEMAP,            ctrl_set_active_map},
   {VP8E_SET_SCALEMODE,            ctrl_set_scale_mode},
   {VP8E_SET_CPUUSED,              ctrl_set_cpuused},
-  {VP8E_SET_NOISE_SENSITIVITY,    ctrl_set_noise_sensitivity},
   {VP8E_SET_ENABLEAUTOALTREF,     ctrl_set_enable_auto_alt_ref},
   {VP8E_SET_SHARPNESS,            ctrl_set_sharpness},
   {VP8E_SET_STATIC_THRESHOLD,     ctrl_set_static_thresh},
@@ -1251,6 +1259,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP9E_SET_SVC_PARAMETERS,       ctrl_set_svc_parameters},
   {VP9E_SET_SVC_LAYER_ID,         ctrl_set_svc_layer_id},
   {VP9E_SET_TUNE_CONTENT,         ctrl_set_tune_content},
+  {VP9E_SET_NOISE_SENSITIVITY,    ctrl_set_noise_sensitivity},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,       ctrl_get_quantizer},
@@ -1333,6 +1342,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   "WebM Project VP9 Encoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t
   encoder_init,     // vpx_codec_init_fn_t
   encoder_destroy,  // vpx_codec_destroy_fn_t
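The capability bit closes the loop opened in encoder_init() above: vpx_codec_enc_init_ver() refuses the VPX_CODEC_USE_HIGHBITDEPTH init flag unless the interface advertises VPX_CODEC_CAP_HIGHBITDEPTH, and encoder_init() then latches the flag into oxcf.use_highbitdepth. A hedged sketch of the application side follows; init_hbd_encoder is an illustrative name, and g_bit_depth / VPX_BITS_10 come from the companion vpx/vpx_encoder.h and vpx/vpx_codec.h changes in this series, not from the hunks shown here.

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static vpx_codec_err_t init_hbd_encoder(vpx_codec_ctx_t *ctx,
                                        unsigned int width,
                                        unsigned int height) {
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_err_t res =
      vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  cfg.g_w = width;
  cfg.g_h = height;
  cfg.g_profile = 2;              /* 10/12-bit VP9 streams use profile 2 */
  cfg.g_bit_depth = VPX_BITS_10;  /* assumes the companion header change */
  /* Without VPX_CODEC_CAP_HIGHBITDEPTH in the caps word added above,
   * this init flag is expected to fail with VPX_CODEC_INCAPABLE. */
  return vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg,
                            VPX_CODEC_USE_HIGHBITDEPTH);
}

On the control path, the remap above means denoising is now requested with the VP9-specific id, e.g. vpx_codec_control(&ctx, VP9E_SET_NOISE_SENSITIVITY, 1) after init; the VP8E_SET_NOISE_SENSITIVITY entry is no longer mapped for VP9.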