diff options
author | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
commit | 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch) | |
tree | 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vpx_scale/generic/bicubic_scaler.c | |
download | libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2 libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip |
Initial WebM release
Diffstat (limited to 'vpx_scale/generic/bicubic_scaler.c')
-rw-r--r-- | vpx_scale/generic/bicubic_scaler.c | 601 |
1 file changed, 601 insertions, 0 deletions
diff --git a/vpx_scale/generic/bicubic_scaler.c b/vpx_scale/generic/bicubic_scaler.c new file mode 100644 index 000000000..e3c2b4a80 --- /dev/null +++ b/vpx_scale/generic/bicubic_scaler.c @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include <float.h> +#include <math.h> +#include <stdio.h> +#include "vpx_mem/vpx_mem.h" +#include "vpxscale_arbitrary.h" + +#define FIXED_POINT + +#define MAX_IN_WIDTH 800 +#define MAX_IN_HEIGHT 600 +#define MAX_OUT_WIDTH 800 +#define MAX_OUT_HEIGHT 600 +#define MAX_OUT_DIMENSION ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \ + MAX_OUT_WIDTH : MAX_OUT_HEIGHT) + +BICUBIC_SCALER_STRUCT g_b_scaler; +static int g_first_time = 1; + +#pragma DATA_SECTION(g_hbuf, "VP6_HEAP") +#pragma DATA_ALIGN (g_hbuf, 32); +unsigned char g_hbuf[MAX_OUT_DIMENSION]; + +#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP") +#pragma DATA_ALIGN (g_hbuf_uv, 32); +unsigned char g_hbuf_uv[MAX_OUT_DIMENSION]; + + +#ifdef FIXED_POINT +static int a_i = 0.6 * 65536; +#else +static float a = -0.6; +#endif + +#ifdef FIXED_POINT +// 3 2 +// C0 = a*t - a*t +// +static INLINE short c0_fixed(unsigned int t) +{ + // put t in Q16 notation + unsigned short v1, v2; + + // Q16 + v1 = (a_i * t) >> 16; + v1 = (v1 * t) >> 16; + + // Q16 + v2 = (a_i * t) >> 16; + v2 = (v2 * t) >> 16; + v2 = (v2 * t) >> 16; + + // Q12 + return -((v1 - v2) >> 4); +} + +// 2 3 +// C1 = a*t + (3-2*a)*t - (2-a)*t +// +static INLINE short c1_fixed(unsigned int t) +{ + unsigned short v1, v2, v3; + unsigned short two, three; + + // Q16 + v1 = (a_i * t) >> 16; + + // Q13 + two = 2 << 13; + v2 = two - (a_i >> 3); + v2 = (v2 * t) >> 16; + v2 = (v2 * t) >> 16; + v2 = (v2 * t) >> 16; + + // Q13 + three = 3 << 13; + v3 
= three - (2 * (a_i >> 3)); + v3 = (v3 * t) >> 16; + v3 = (v3 * t) >> 16; + + // Q12 + return (((v1 >> 3) - v2 + v3) >> 1); + +} + +// 2 3 +// C2 = 1 - (3-a)*t + (2-a)*t +// +static INLINE short c2_fixed(unsigned int t) +{ + unsigned short v1, v2, v3; + unsigned short two, three; + + // Q13 + v1 = 1 << 13; + + // Q13 + three = 3 << 13; + v2 = three - (a_i >> 3); + v2 = (v2 * t) >> 16; + v2 = (v2 * t) >> 16; + + // Q13 + two = 2 << 13; + v3 = two - (a_i >> 3); + v3 = (v3 * t) >> 16; + v3 = (v3 * t) >> 16; + v3 = (v3 * t) >> 16; + + // Q12 + return (v1 - v2 + v3) >> 1; +} + +// 2 3 +// C3 = a*t - 2*a*t + a*t +// +static INLINE short c3_fixed(unsigned int t) +{ + int v1, v2, v3; + + // Q16 + v1 = (a_i * t) >> 16; + + // Q15 + v2 = 2 * (a_i >> 1); + v2 = (v2 * t) >> 16; + v2 = (v2 * t) >> 16; + + // Q16 + v3 = (a_i * t) >> 16; + v3 = (v3 * t) >> 16; + v3 = (v3 * t) >> 16; + + // Q12 + return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3); +} +#else +// 3 2 +// C0 = -a*t + a*t +// +float C0(float t) +{ + return -a * t * t * t + a * t * t; +} + +// 2 3 +// C1 = -a*t + (2*a+3)*t - (a+2)*t +// +float C1(float t) +{ + return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t; +} + +// 2 3 +// C2 = 1 - (a+3)*t + (a+2)*t +// +float C2(float t) +{ + return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f; +} + +// 2 3 +// C3 = a*t - 2*a*t + a*t +// +float C3(float t) +{ + return a * t * t * t - 2.0f * a * t * t + a * t; +} +#endif + +#if 0 +int compare_real_fixed() +{ + int i, errors = 0; + float mult = 1.0 / 10000.0; + unsigned int fixed_mult = mult * 4294967296;//65536; + unsigned int phase_offset_int; + float phase_offset_real; + + for (i = 0; i < 10000; i++) + { + int fixed0, fixed1, fixed2, fixed3, fixed_total; + int real0, real1, real2, real3, real_total; + + phase_offset_real = (float)i * mult; + phase_offset_int = (fixed_mult * i) >> 16; +// phase_offset_int = phase_offset_real * 65536; + + fixed0 = c0_fixed(phase_offset_int); + real0 = C0(phase_offset_real) * 4096.0; 
+ + if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1))) + errors++; + + fixed1 = c1_fixed(phase_offset_int); + real1 = C1(phase_offset_real) * 4096.0; + + if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1))) + errors++; + + fixed2 = c2_fixed(phase_offset_int); + real2 = C2(phase_offset_real) * 4096.0; + + if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1))) + errors++; + + fixed3 = c3_fixed(phase_offset_int); + real3 = C3(phase_offset_real) * 4096.0; + + if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1))) + errors++; + + fixed_total = fixed0 + fixed1 + fixed2 + fixed3; + real_total = real0 + real1 + real2 + real3; + + if ((fixed_total > 4097) || (fixed_total < 4094)) + errors ++; + + if ((real_total > 4097) || (real_total < 4095)) + errors ++; + } + + return errors; +} +#endif + +// Find greatest common denominator between two integers. Method used here is +// slow compared to Euclid's algorithm, but does not require any division. +int gcd(int a, int b) +{ + // Problem with this algorithm is that if a or b = 0 this function + // will never exit. Don't want to return 0 because any computation + // that was based on a common denoninator and tried to reduce by + // dividing by 0 would fail. 
Best solution that could be thought of + // would to be fail by returing a 1; + if (a <= 0 || b <= 0) + return 1; + + while (a != b) + { + if (b > a) + b = b - a; + else + { + int tmp = a;//swap large and + a = b; //small + b = tmp; + } + } + + return b; +} + +void bicubic_coefficient_init() +{ + vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT)); + g_first_time = 0; +} + +void bicubic_coefficient_destroy() +{ + if (!g_first_time) + { + if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w); + + if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h); + + if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv); + + if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w); + + if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h); + + if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv); + + vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT)); + } +} + +// Create the coeffients that will be used for the cubic interpolation. +// Because scaling does not have to be equal in the vertical and horizontal +// regimes the phase offsets will be different. There are 4 coefficents +// for each point, two on each side. The layout is that there are the +// 4 coefficents for each phase in the array and then the next phase. 
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height) +{ + int i; +#ifdef FIXED_POINT + int phase_offset_int; + unsigned int fixed_mult; + int product_val = 0; +#else + float phase_offset; +#endif + int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv; + + if (g_first_time) + bicubic_coefficient_init(); + + + // check to see if the coefficents have already been set up correctly + if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height) + && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height)) + return 0; + + g_b_scaler.in_width = in_width; + g_b_scaler.in_height = in_height; + g_b_scaler.out_width = out_width; + g_b_scaler.out_height = out_height; + + // Don't want to allow crazy scaling, just try and prevent a catastrophic + // failure here. Want to fail after setting the member functions so if + // if the scaler is called the member functions will not scale. + if (out_width <= 0 || out_height <= 0) + return -1; + + // reduce in/out width and height ratios using the gcd + gcd_w = gcd(out_width, in_width); + gcd_h = gcd(out_height, in_height); + gcd_h_uv = gcd(out_height, in_height / 2); + + // the numerator width and height are to be saved in + // globals so they can be used during the scaling process + // without having to be recalculated. 
+ g_b_scaler.nw = out_width / gcd_w; + d_w = in_width / gcd_w; + + g_b_scaler.nh = out_height / gcd_h; + d_h = in_height / gcd_h; + + g_b_scaler.nh_uv = out_height / gcd_h_uv; + d_h_uv = (in_height / 2) / gcd_h_uv; + + // allocate memory for the coefficents + if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w); + + if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h); + + if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv); + + g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2); + g_b_scaler.l_h = (short *)vpx_memalign(32, out_height * 2); + g_b_scaler.l_h_uv = (short *)vpx_memalign(32, out_height * 2); + + if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w); + + if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h); + + if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv); + + g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2); + g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2); + g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2); + + g_b_scaler.hbuf = g_hbuf; + g_b_scaler.hbuf_uv = g_hbuf_uv; + + // Set up polyphase filter taps. This needs to be done before + // the scaling because of the floating point math required. The + // coefficients are multiplied by 2^12 so that fixed point math + // can be used in the main scaling loop. 
+#ifdef FIXED_POINT + fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296; + + product_val = 0; + + for (i = 0; i < g_b_scaler.nw; i++) + { + if (product_val > g_b_scaler.nw) + product_val -= g_b_scaler.nw; + + phase_offset_int = (fixed_mult * product_val) >> 16; + + g_b_scaler.c_w[i*4] = c3_fixed(phase_offset_int); + g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int); + g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int); + g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int); + + product_val += d_w; + } + + + fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296; + + product_val = 0; + + for (i = 0; i < g_b_scaler.nh; i++) + { + if (product_val > g_b_scaler.nh) + product_val -= g_b_scaler.nh; + + phase_offset_int = (fixed_mult * product_val) >> 16; + + g_b_scaler.c_h[i*4] = c0_fixed(phase_offset_int); + g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int); + g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int); + g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int); + + product_val += d_h; + } + + fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296; + + product_val = 0; + + for (i = 0; i < g_b_scaler.nh_uv; i++) + { + if (product_val > g_b_scaler.nh_uv) + product_val -= g_b_scaler.nh_uv; + + phase_offset_int = (fixed_mult * product_val) >> 16; + + g_b_scaler.c_h_uv[i*4] = c0_fixed(phase_offset_int); + g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int); + g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int); + g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int); + + product_val += d_h_uv; + } + +#else + + for (i = 0; i < g_nw; i++) + { + phase_offset = (float)((i * d_w) % g_nw) / (float)g_nw; + g_c_w[i*4] = (C3(phase_offset) * 4096.0); + g_c_w[i*4+1] = (C2(phase_offset) * 4096.0); + g_c_w[i*4+2] = (C1(phase_offset) * 4096.0); + g_c_w[i*4+3] = (C0(phase_offset) * 4096.0); + } + + for (i = 0; i < g_nh; i++) + { + phase_offset = (float)((i * d_h) % g_nh) / (float)g_nh; + g_c_h[i*4] = (C0(phase_offset) * 4096.0); + g_c_h[i*4+1] = (C1(phase_offset) * 4096.0); + 
g_c_h[i*4+2] = (C2(phase_offset) * 4096.0); + g_c_h[i*4+3] = (C3(phase_offset) * 4096.0); + } + + for (i = 0; i < g_nh_uv; i++) + { + phase_offset = (float)((i * d_h_uv) % g_nh_uv) / (float)g_nh_uv; + g_c_h_uv[i*4] = (C0(phase_offset) * 4096.0); + g_c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0); + g_c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0); + g_c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0); + } + +#endif + + // Create an array that corresponds input lines to output lines. + // This doesn't require floating point math, but it does require + // a division and because hardware division is not present that + // is a call. + for (i = 0; i < out_width; i++) + { + g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw; + + if ((g_b_scaler.l_w[i] + 2) <= in_width) + g_b_scaler.max_usable_out_width = i; + + } + + for (i = 0; i < out_height + 1; i++) + { + g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh; + g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv; + } + + return 0; +} + +int bicubic_scale(int in_width, int in_height, int in_stride, + int out_width, int out_height, int out_stride, + unsigned char *input_image, unsigned char *output_image) +{ + short *RESTRICT l_w, * RESTRICT l_h; + short *RESTRICT c_w, * RESTRICT c_h; + unsigned char *RESTRICT ip, * RESTRICT op; + unsigned char *RESTRICT hbuf; + int h, w, lw, lh; + int temp_sum; + int phase_offset_w, phase_offset_h; + + c_w = g_b_scaler.c_w; + c_h = g_b_scaler.c_h; + + op = output_image; + + l_w = g_b_scaler.l_w; + l_h = g_b_scaler.l_h; + + phase_offset_h = 0; + + for (h = 0; h < out_height; h++) + { + // select the row to work on + lh = l_h[h]; + ip = input_image + (in_stride * lh); + + // vp8_filter the row vertically into an temporary buffer. + // If the phase offset == 0 then all the multiplication + // is going to result in the output equalling the input. + // So instead point the temporary buffer to the input. + // Also handle the boundry condition of not being able to + // filter that last lines. 
+ if (phase_offset_h && (lh < in_height - 2)) + { + hbuf = g_b_scaler.hbuf; + + for (w = 0; w < in_width; w++) + { + temp_sum = c_h[phase_offset_h*4+3] * ip[w - in_stride]; + temp_sum += c_h[phase_offset_h*4+2] * ip[w]; + temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride]; + temp_sum += c_h[phase_offset_h*4] * ip[w + 2*in_stride]; + + hbuf[w] = temp_sum >> 12; + } + } + else + hbuf = ip; + + // increase the phase offset for the next time around. + if (++phase_offset_h >= g_b_scaler.nh) + phase_offset_h = 0; + + // now filter and expand it horizontally into the final + // output buffer + phase_offset_w = 0; + + for (w = 0; w < out_width; w++) + { + // get the index to use to expand the image + lw = l_w[w]; + + temp_sum = c_w[phase_offset_w*4] * hbuf[lw - 1]; + temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw]; + temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1]; + temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2]; + temp_sum = temp_sum >> 12; + + if (++phase_offset_w >= g_b_scaler.nw) + phase_offset_w = 0; + + // boundry conditions + if ((lw + 2) >= in_width) + temp_sum = hbuf[lw]; + + if (lw == 0) + temp_sum = hbuf[0]; + + op[w] = temp_sum; + } + + op += out_stride; + } + + return 0; +} + +void bicubic_scale_frame_reset() +{ + g_b_scaler.out_width = 0; + g_b_scaler.out_height = 0; +} + +void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int new_width, int new_height) +{ + + dst->y_width = new_width; + dst->y_height = new_height; + dst->uv_width = new_width / 2; + dst->uv_height = new_height / 2; + + dst->y_stride = dst->y_width; + dst->uv_stride = dst->uv_width; + + bicubic_scale(src->y_width, src->y_height, src->y_stride, + new_width, new_height, dst->y_stride, + src->y_buffer, dst->y_buffer); + + bicubic_scale(src->uv_width, src->uv_height, src->uv_stride, + new_width / 2, new_height / 2, dst->uv_stride, + src->u_buffer, dst->u_buffer); + + bicubic_scale(src->uv_width, src->uv_height, src->uv_stride, + new_width / 2, new_height / 2, 
dst->uv_stride, + src->v_buffer, dst->v_buffer); +} |