diff options
Diffstat (limited to 'vp9/encoder/vp9_dct.c')
-rw-r--r-- | vp9/encoder/vp9_dct.c | 83 |
1 files changed, 74 insertions, 9 deletions
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index e14421d2d..0de6393a0 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -1332,8 +1332,9 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) { #undef ROUNDING #endif +#if CONFIG_TX32X32 || CONFIG_TX64X64 +#if !CONFIG_DWTDCTHYBRID #if CONFIG_TX32X32 -#if !CONFIG_DWT32X32HYBRID static void dct32_1d(double *input, double *output, int stride) { static const double C1 = 0.998795456205; // cos(pi * 1 / 64) static const double C2 = 0.995184726672; // cos(pi * 2 / 64) @@ -1684,8 +1685,9 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { vp9_clear_system_state(); // Make it simd safe : __asm emms; } +#endif // CONFIG_TX32X32 -#else // CONFIG_DWT32X32HYBRID +#else // CONFIG_DWTDCTHYBRID #define DWT_MAX_LENGTH 64 #define DWT_TYPE 26 // 26/53/97 @@ -2108,7 +2110,8 @@ static void dct16x16_1d_f(double input[16], double output[16]) { vp9_clear_system_state(); // Make it simd safe : __asm emms; } -void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch) { +static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch, + int scale) { vp9_clear_system_state(); // Make it simd safe : __asm emms; { int shortpitch = pitch >> 1; @@ -2134,11 +2137,12 @@ void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch) { } // Scale by some magic number for (i = 0; i < 256; i++) - out[i] = (short)round(output[i] / (4 << DWT_PRECISION_BITS)); + out[i] = (short)round(output[i] / (2 << scale)); } vp9_clear_system_state(); // Make it simd safe : __asm emms; } +#if CONFIG_TX32X32 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { // assume out is a 32x32 buffer short buffer[16 * 16]; @@ -2153,21 +2157,82 @@ void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { #endif // TODO(debargha): Implement more efficiently by adding output pitch // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64); + 
vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
 
-  vp9_short_fdct16x16_c_f(out + 16, buffer, 64);
+  vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
 
-  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64);
+  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
 
-  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64);
+  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
 }
-#endif // CONFIG_DWT32X32HYBRID
 #endif // CONFIG_TX32X32
+
+#if CONFIG_TX64X64
+// DWT analysis, then 16x16 DCTs on the top-left 32x32; out is 64x64 shorts.
+void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
+  short buffer[16 * 16];
+  int i, j;
+  const int short_pitch = pitch >> 1;  // halved: pitch presumably in bytes
+#if DWT_TYPE == 26
+  dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  // There is no dct used on the highest bands for now.
+  // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
+  // TODO(debargha): experiment with turning these coeffs to 0
+#if DWT_PRECISION_BITS < 1  // multiply, since << on negatives is undefined
+  for (i = 0; i < 32; ++i) {  // top-right quadrant: rows 0..31, cols 32..63
+    for (j = 0; j < 32; ++j) {
+      out[i * 64 + 32 + j] *= 1 << (1 - DWT_PRECISION_BITS);
+    }
+  }
+  for (i = 32; i < 64; ++i) {  // bottom half: rows 32..63, all columns
+    for (j = 0; j < 64; ++j) {
+      out[i * 64 + j] *= 1 << (1 - DWT_PRECISION_BITS);
+    }
+  }
+#else
+  for (i = 0; i < 32; ++i) {  // top-right quadrant: rows 0..31, cols 32..63
+    for (j = 0; j < 32; ++j) {
+      out[i * 64 + 32 + j] >>= (DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 32; i < 64; ++i) {  // bottom half: rows 32..63, all columns
+    for (j = 0; j < 64; ++j) {
+      out[i * 64 + j] >>= (DWT_PRECISION_BITS - 1);
+    }
+  }
+#endif
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_DWTDCTHYBRID
+#endif // CONFIG_TX32X32 || CONFIG_TX64X64 |