diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_entropy.c | 4 | ||||
-rw-r--r-- | vp9/common/vp9_idctllm.c | 110 | ||||
-rw-r--r-- | vp9/encoder/vp9_dct.c | 83 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 6 |
4 files changed, 173 insertions, 30 deletions
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index cdc8bc14e..bc8738411 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -143,7 +143,7 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { }; #if CONFIG_TX32X32 -#if CONFIG_DWT32X32HYBRID +#if CONFIG_DWTDCTHYBRID DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, 6, 6, @@ -458,7 +458,7 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { 951, 920, 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023, }; -#endif // CONFIG_DWT32X32HYBRID +#endif // CONFIG_DWTDCTHYBRID #endif /* Array indices are identical to previously-existing CONTEXT_NODE indices */ diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index 4dd540e2a..baa22457f 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -1534,7 +1534,7 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { #endif #if CONFIG_TX32X32 -#if !CONFIG_DWT32X32HYBRID +#if !CONFIG_DWTDCTHYBRID #define DownshiftMultiplyBy2(x) x * 2 #define DownshiftMultiply(x) x static void idct16(double *input, double *output, int stride) { @@ -1879,7 +1879,7 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { vp9_clear_system_state(); // Make it simd safe : __asm emms; } -#else // CONFIG_DWT32X32HYBRID +#else // CONFIG_DWTDCTHYBRID #define DWT_MAX_LENGTH 32 #define DWT_TYPE 26 // 26/53/97 @@ -1940,8 +1940,8 @@ static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass, *x++ = ((*b) << 1) + *a; } -void dyadic_synthesize_53(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { +static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c, + int pitch_c, int16_t *x, int pitch_x) { int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; short buffer[2 * DWT_MAX_LENGTH]; @@ -2031,8 +2031,8 @@ static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass, } } -void dyadic_synthesize_26(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { +static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c, + int pitch_c, int16_t *x, int pitch_x) { int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; int16_t buffer[2 * DWT_MAX_LENGTH]; @@ -2111,8 +2111,8 @@ static void synthesis_97(int length, double *lowpass, double *highpass, x[length - 1] -= 2 * a_predict1 * x[length - 2]; } -void dyadic_synthesize_97(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { +static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c, + int pitch_c, int16_t *x, int pitch_x) { int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; double buffer[2 * DWT_MAX_LENGTH]; double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; @@ -2358,7 +2358,8 @@ static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) { vp9_clear_system_state(); // Make it simd safe : __asm emms; } -void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch) { +static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch, + int scale) { vp9_clear_system_state(); // Make it simd safe : __asm emms; { double out[16*16], out2[16*16]; @@ -2383,13 +2384,13 @@ void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch) { out2[j*16 + i] = temp_out[j]; } for (i = 0; i < 16*16; ++i) - output[i] = round(out2[i] / (64 >> DWT_PRECISION_BITS)); + output[i] = round(out2[i] / (128 >> scale)); } vp9_clear_system_state(); // Make it simd safe : __asm emms; } void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume out is a 32x32 buffer + // assume output is a 32x32 buffer // Temporary buffer to hold a 16x16 block for 16x16 inverse dct int16_t buffer[16 * 16]; // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt @@ -2400,20 +2401,24 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { // TODO(debargha): Implement more efficiently by adding output pitch // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch); + vp9_short_idct16x16_c_f(input, buffer, pitch, + 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) { vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); } - vp9_short_idct16x16_c_f(input + 16, buffer, pitch); + vp9_short_idct16x16_c_f(input + 16, buffer, pitch, + 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) { vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16); } - vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch); + vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, + 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) { vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16, sizeof(*buffer2) * 16); } - vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch); + vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, + 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) { vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16, sizeof(*buffer2) * 16); @@ -2426,5 +2431,78 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); #endif } -#endif // CONFIG_DWT32X32HYBRID + +void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) { + // assume output is a 64x64 buffer + // Temporary buffer to hold a 16x16 block for 16x16 inverse dct + int16_t buffer[16 * 16]; + // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt + int16_t buffer2[64 * 64]; + // Note: pitch is in bytes, short_pitch is in short units + const int short_pitch = pitch >> 1; + int i, j; + + // TODO(debargha): Implement more efficiently by adding output pitch + // argument to the idct16x16 function + vp9_short_idct16x16_c_f(input, buffer, pitch, + 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) { + vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16); + } + vp9_short_idct16x16_c_f(input + 16, buffer, pitch, + 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) { + vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16); + } + vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, + 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) { + vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16, + sizeof(*buffer2) * 16); + } + vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, + 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) { + vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16, + sizeof(*buffer2) * 16); + } + + // Copying and scaling highest bands into buffer2 +#if DWT_PRECISION_BITS < 1 + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) { + buffer2[i * 64 + 32 + j] = + input[i * short_pitch + 32 + j] >> (1 - DWT_PRECISION_BITS); + } + } + for (i = 0; i < 32; ++i) { + for (j = 0; j < 64; ++j) { + buffer2[i * 64 + j] = + input[(i + 32) * short_pitch + j] >> (1 - DWT_PRECISION_BITS); + } + } +#else + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) { + buffer2[i * 64 + 32 + j] = + input[i * short_pitch + 32 + j] << (DWT_PRECISION_BITS - 1); + } + } + for (i = 0; i < 32; ++i) { + for (j = 0; j < 64; ++j) { + buffer2[i * 64 + j] = + input[(i + 32) * short_pitch + j] << (DWT_PRECISION_BITS - 1); + } + } +#endif + +#if DWT_TYPE == 26 + dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64); +#elif DWT_TYPE == 97 + dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64); +#elif DWT_TYPE == 53 + dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64); +#endif +} +#endif // CONFIG_DWTDCTHYBRID #endif // CONFIG_TX32X32 diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index e14421d2d..0de6393a0 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -1332,8 +1332,9 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) { #undef ROUNDING #endif +#if CONFIG_TX32X32 || CONFIG_TX64X64 +#if !CONFIG_DWTDCTHYBRID #if CONFIG_TX32X32 -#if !CONFIG_DWT32X32HYBRID static void dct32_1d(double *input, double *output, int stride) { static const double C1 = 0.998795456205; // cos(pi * 1 / 64) static const double C2 = 0.995184726672; // cos(pi * 2 / 64) @@ -1684,8 +1685,9 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { vp9_clear_system_state(); // Make it simd safe : __asm emms; } +#endif // CONFIG_TX32X32 -#else // CONFIG_DWT32X32HYBRID +#else // CONFIG_DWTDCTHYBRID #define DWT_MAX_LENGTH 64 #define DWT_TYPE 26 // 26/53/97 @@ -2108,7 +2110,8 @@ static void dct16x16_1d_f(double input[16], double output[16]) { vp9_clear_system_state(); // Make it simd safe : __asm emms; } -void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch) { +static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch, + int scale) { vp9_clear_system_state(); // Make it simd safe : __asm emms; { int shortpitch = pitch >> 1; @@ -2134,11 +2137,12 @@ void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch) { } // Scale by some magic number for (i = 0; i < 256; i++) - out[i] = (short)round(output[i] / (4 << DWT_PRECISION_BITS)); + out[i] = (short)round(output[i] / (2 << scale)); } vp9_clear_system_state(); // Make it simd safe : __asm emms; } +#if CONFIG_TX32X32 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { // assume out is a 32x32 buffer short buffer[16 * 16]; @@ -2153,21 +2157,82 @@ void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { #endif // TODO(debargha): Implement more efficiently by adding output pitch // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64); + vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); - vp9_short_fdct16x16_c_f(out + 16, buffer, 64); + vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16); - vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64); + vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16); - vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64); + vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); for (i = 0; i < 16; ++i) vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16); } -#endif // CONFIG_DWT32X32HYBRID #endif // CONFIG_TX32X32 + +#if CONFIG_TX64X64 +void vp9_short_fdct64x64_c(short *input, short *out, int pitch) { + // assume out is a 64x64 buffer + short buffer[16 * 16]; + int i, j; + const int short_pitch = pitch >> 1; +#if DWT_TYPE == 26 + dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64); +#elif DWT_TYPE == 97 + dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64); +#elif DWT_TYPE == 53 + dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64); +#endif + // TODO(debargha): Implement more efficiently by adding output pitch + // argument to the dct16x16 function + vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) + vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16); + + vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) + vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16); + + vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) + vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16); + + vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); + for (i = 0; i < 16; ++i) + vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16); + + // There is no dct used on the highest bands for now. + // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS + // TODO(debargha): experiment with turning these coeffs to 0 +#if DWT_PRECISION_BITS < 1 + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) { + out[i * 64 + 32 + j] <<= (1 - DWT_PRECISION_BITS); + } + } + for (i = 0; i < 32; ++i) { + for (j = 0; j < 64; ++j) { + out[i * 64 + j] <<= (1 - DWT_PRECISION_BITS); + } + } +#else + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) { + out[i * 64 + 32 + j] >>= (DWT_PRECISION_BITS - 1); + } + } + for (i = 0; i < 32; ++i) { + for (j = 0; j < 64; ++j) { + out[i * 64 + j] >>= (DWT_PRECISION_BITS - 1); + } + } +#endif +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_DWTDCTHYBRID +#endif // CONFIG_TX32X32 || CONFIG_TX64X64 diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index c695c049a..956d8f90c 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -965,17 +965,17 @@ static void super_block_yrd_32x32(MACROBLOCK *x, SUPERBLOCK * const x_sb = &x->sb_coeff_data; MACROBLOCKD * const xd = &x->e_mbd; SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data; -#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID +#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID int16_t out[1024]; #endif vp9_transform_sby_32x32(x); vp9_quantize_sby_32x32(x); -#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID +#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID vp9_short_idct32x32(xd_sb->dqcoeff, out, 64); #endif -#if !CONFIG_DWT32X32HYBRID +#if !CONFIG_DWTDCTHYBRID *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024); #else *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4; |