Diffstat (limited to 'vpx_scale/generic/bicubic_scaler.c')
-rw-r--r-- vpx_scale/generic/bicubic_scaler.c | 601
1 file changed, 601 insertions, 0 deletions
diff --git a/vpx_scale/generic/bicubic_scaler.c b/vpx_scale/generic/bicubic_scaler.c
new file mode 100644
index 000000000..e3c2b4a80
--- /dev/null
+++ b/vpx_scale/generic/bicubic_scaler.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vpxscale_arbitrary.h"
+
+#define FIXED_POINT
+
+#define MAX_IN_WIDTH 800
+#define MAX_IN_HEIGHT 600
+#define MAX_OUT_WIDTH 800
+#define MAX_OUT_HEIGHT 600
+#define MAX_OUT_DIMENSION ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
+ MAX_OUT_WIDTH : MAX_OUT_HEIGHT)
+
+BICUBIC_SCALER_STRUCT g_b_scaler;
+static int g_first_time = 1;
+
+#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
+#pragma DATA_ALIGN (g_hbuf, 32);
+unsigned char g_hbuf[MAX_OUT_DIMENSION];
+
+#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
+#pragma DATA_ALIGN (g_hbuf_uv, 32);
+unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];
+
+
+#ifdef FIXED_POINT
+static int a_i = 0.6 * 65536;    // |a| in Q16; the sign of a is folded into the kernel formulas below
+#else
+static float a = -0.6;
+#endif
+
+#ifdef FIXED_POINT
+// C0 = a*t^3 - a*t^2
+//
+static INLINE short c0_fixed(unsigned int t)
+{
+ // t is passed in Q16 notation
+ unsigned short v1, v2;
+
+ // Q16
+ v1 = (a_i * t) >> 16;
+ v1 = (v1 * t) >> 16;
+
+ // Q16
+ v2 = (a_i * t) >> 16;
+ v2 = (v2 * t) >> 16;
+ v2 = (v2 * t) >> 16;
+
+ // Q12
+ return -((v1 - v2) >> 4);
+}
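+
+// Worked example (illustrative, not from the original source): with
+// t = 0.5 in Q16 (32768) and a_i = 39321, the multiplies above give
+// v1 = a*t^2 = 9830 and v2 = a*t^3 = 4915 in Q16. (v1 - v2) >> 4 drops
+// the result into Q12, and the negation yields -307, which matches
+// C0(0.5) = -0.075 scaled by 4096.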
+
+// C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3
+//
+static INLINE short c1_fixed(unsigned int t)
+{
+ unsigned short v1, v2, v3;
+ unsigned short two, three;
+
+ // Q16
+ v1 = (a_i * t) >> 16;
+
+ // Q13
+ two = 2 << 13;
+ v2 = two - (a_i >> 3);
+ v2 = (v2 * t) >> 16;
+ v2 = (v2 * t) >> 16;
+ v2 = (v2 * t) >> 16;
+
+ // Q13
+ three = 3 << 13;
+ v3 = three - (2 * (a_i >> 3));
+ v3 = (v3 * t) >> 16;
+ v3 = (v3 * t) >> 16;
+
+ // Q12
+ return (((v1 >> 3) - v2 + v3) >> 1);
+
+}
+
+// C2 = 1 - (3-a)*t^2 + (2-a)*t^3
+//
+static INLINE short c2_fixed(unsigned int t)
+{
+ unsigned short v1, v2, v3;
+ unsigned short two, three;
+
+ // Q13
+ v1 = 1 << 13;
+
+ // Q13
+ three = 3 << 13;
+ v2 = three - (a_i >> 3);
+ v2 = (v2 * t) >> 16;
+ v2 = (v2 * t) >> 16;
+
+ // Q13
+ two = 2 << 13;
+ v3 = two - (a_i >> 3);
+ v3 = (v3 * t) >> 16;
+ v3 = (v3 * t) >> 16;
+ v3 = (v3 * t) >> 16;
+
+ // Q12
+ return (v1 - v2 + v3) >> 1;
+}
+
+// C3 = a*t - 2*a*t^2 + a*t^3
+//
+static INLINE short c3_fixed(unsigned int t)
+{
+ int v1, v2, v3;
+
+ // Q16
+ v1 = (a_i * t) >> 16;
+
+ // Q15
+ v2 = 2 * (a_i >> 1);
+ v2 = (v2 * t) >> 16;
+ v2 = (v2 * t) >> 16;
+
+ // Q16
+ v3 = (a_i * t) >> 16;
+ v3 = (v3 * t) >> 16;
+ v3 = (v3 * t) >> 16;
+
+ // Q12
+ return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
+}
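+
+// Spot check (illustrative): at t = 0.5 (32768 in Q16) the kernels
+// above give c0 = -307, c1 = 2355, c2 = 2355, c3 = -308, i.e. roughly
+// {-0.075, 0.575, 0.575, -0.075} * 4096, summing to 4095 ~= 1.0 in Q12.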
+#else
+// C0 = -a*t^3 + a*t^2
+//
+float C0(float t)
+{
+ return -a * t * t * t + a * t * t;
+}
+
+// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
+//
+float C1(float t)
+{
+ return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
+}
+
+// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
+//
+float C2(float t)
+{
+ return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
+}
+
+// C3 = a*t - 2*a*t^2 + a*t^3
+//
+float C3(float t)
+{
+ return a * t * t * t - 2.0f * a * t * t + a * t;
+}
+#endif
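+
+// For any t in [0, 1) the four weights satisfy C0 + C1 + C2 + C3 = 1
+// (the t, t^2 and t^3 terms all cancel), so filtering a flat region
+// preserves its level. In Q12 the taps should therefore sum to roughly
+// 4096, which is what the disabled comparison test below checks.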
+
+#if 0
+int compare_real_fixed()
+{
+ int i, errors = 0;
+ float mult = 1.0 / 10000.0;
+ unsigned int fixed_mult = mult * 4294967296; // mult in Q32 (2^32 = 4294967296)
+ unsigned int phase_offset_int;
+ float phase_offset_real;
+
+ for (i = 0; i < 10000; i++)
+ {
+ int fixed0, fixed1, fixed2, fixed3, fixed_total;
+ int real0, real1, real2, real3, real_total;
+
+ phase_offset_real = (float)i * mult;
+ phase_offset_int = (fixed_mult * i) >> 16;
+// phase_offset_int = phase_offset_real * 65536;
+
+ fixed0 = c0_fixed(phase_offset_int);
+ real0 = C0(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
+ errors++;
+
+ fixed1 = c1_fixed(phase_offset_int);
+ real1 = C1(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
+ errors++;
+
+ fixed2 = c2_fixed(phase_offset_int);
+ real2 = C2(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
+ errors++;
+
+ fixed3 = c3_fixed(phase_offset_int);
+ real3 = C3(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
+ errors++;
+
+ fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
+ real_total = real0 + real1 + real2 + real3;
+
+ if ((fixed_total > 4097) || (fixed_total < 4094))
+ errors ++;
+
+ if ((real_total > 4097) || (real_total < 4095))
+ errors ++;
+ }
+
+ return errors;
+}
+#endif
+
+// Find the greatest common divisor of two integers. The method used
+// here, repeated subtraction, is slow compared to the division form of
+// Euclid's algorithm, but it does not require any division.
+int gcd(int a, int b)
+{
+ // If a or b were <= 0 this loop would never exit. Returning 0 is not
+ // an option, because any computation that reduced a ratio by the
+ // result would then divide by zero, so the least-bad failure mode is
+ // to return 1.
+ if (a <= 0 || b <= 0)
+ return 1;
+
+ while (a != b)
+ {
+ if (b > a)
+ b = b - a;
+ else
+ {
+ int tmp = a;//swap large and
+ a = b; //small
+ b = tmp;
+ }
+ }
+
+ return b;
+}
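+
+// Example (illustrative): gcd(640, 480) swaps so that b holds the
+// larger value, then subtracts: (480,640) -> (480,160) -> (160,480) ->
+// (160,320) -> (160,160), returning 160. So a 640x480 -> 800x600
+// resize reduces to a 4:5 ratio in each direction.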
+
+void bicubic_coefficient_init()
+{
+ vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
+ g_first_time = 0;
+}
+
+void bicubic_coefficient_destroy()
+{
+ if (!g_first_time)
+ {
+ if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);
+
+ if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);
+
+ if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);
+
+ if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);
+
+ if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);
+
+ if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);
+
+ vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
+ }
+}
+
+// Create the coefficients that will be used for the cubic
+// interpolation. Because the scale factor does not have to be the same
+// vertically and horizontally, the phase offsets are computed
+// separately for each direction. There are 4 coefficients for each
+// output point, two taps on each side; the array stores the 4
+// coefficients for one phase, then the 4 for the next phase, and so on.
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
+{
+ int i;
+#ifdef FIXED_POINT
+ int phase_offset_int;
+ unsigned int fixed_mult;
+ int product_val = 0;
+#else
+ float phase_offset;
+#endif
+ int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;
+
+ if (g_first_time)
+ bicubic_coefficient_init();
+
+
+ // check to see if the coefficients have already been set up correctly
+ if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
+ && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
+ return 0;
+
+ g_b_scaler.in_width = in_width;
+ g_b_scaler.in_height = in_height;
+ g_b_scaler.out_width = out_width;
+ g_b_scaler.out_height = out_height;
+
+ // Don't want to allow crazy scaling, just try to prevent a
+ // catastrophic failure here. The dimensions were stored above before
+ // failing so that if the scaler is called anyway it will not scale.
+ if (out_width <= 0 || out_height <= 0)
+ return -1;
+
+ // reduce in/out width and height ratios using the gcd
+ gcd_w = gcd(out_width, in_width);
+ gcd_h = gcd(out_height, in_height);
+ gcd_h_uv = gcd(out_height, in_height / 2);
+
+ // the numerator width and height are to be saved in
+ // globals so they can be used during the scaling process
+ // without having to be recalculated.
+ g_b_scaler.nw = out_width / gcd_w;
+ d_w = in_width / gcd_w;
+
+ g_b_scaler.nh = out_height / gcd_h;
+ d_h = in_height / gcd_h;
+
+ g_b_scaler.nh_uv = out_height / gcd_h_uv;
+ d_h_uv = (in_height / 2) / gcd_h_uv;
+
+ // allocate memory for the coefficients
+ if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);
+
+ if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);
+
+ if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);
+
+ g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
+ // the line-map loops below write out_height + 1 entries, so allocate
+ // one extra
+ g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * 2);
+ g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * 2);
+
+ if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);
+
+ if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);
+
+ if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);
+
+ g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
+ g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
+ g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);
+
+ g_b_scaler.hbuf = g_hbuf;
+ g_b_scaler.hbuf_uv = g_hbuf_uv;
+
+ // Set up polyphase filter taps. This needs to be done before
+ // the scaling because of the floating point math required. The
+ // coefficients are multiplied by 2^12 so that fixed point math
+ // can be used in the main scaling loop.
+#ifdef FIXED_POINT
+ fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;
+
+ product_val = 0;
+
+ for (i = 0; i < g_b_scaler.nw; i++)
+ {
+ if (product_val > g_b_scaler.nw)
+ product_val -= g_b_scaler.nw;
+
+ phase_offset_int = (fixed_mult * product_val) >> 16;
+
+ g_b_scaler.c_w[i*4] = c3_fixed(phase_offset_int);
+ g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
+ g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
+ g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);
+
+ product_val += d_w;
+ }
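+
+ // Note: c_w stores its taps in reverse order (c3..c0) while c_h and
+ // c_h_uv store c0..c3. Either way the tap from c3_fixed() lands on
+ // the sample one step before the anchor and the tap from c0_fixed()
+ // on the sample two steps after it, matching the indexing in
+ // bicubic_scale() below.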
+
+
+ fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;
+
+ product_val = 0;
+
+ for (i = 0; i < g_b_scaler.nh; i++)
+ {
+ if (product_val > g_b_scaler.nh)
+ product_val -= g_b_scaler.nh;
+
+ phase_offset_int = (fixed_mult * product_val) >> 16;
+
+ g_b_scaler.c_h[i*4] = c0_fixed(phase_offset_int);
+ g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
+ g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
+ g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);
+
+ product_val += d_h;
+ }
+
+ fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;
+
+ product_val = 0;
+
+ for (i = 0; i < g_b_scaler.nh_uv; i++)
+ {
+ if (product_val > g_b_scaler.nh_uv)
+ product_val -= g_b_scaler.nh_uv;
+
+ phase_offset_int = (fixed_mult * product_val) >> 16;
+
+ g_b_scaler.c_h_uv[i*4] = c0_fixed(phase_offset_int);
+ g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
+ g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
+ g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);
+
+ product_val += d_h_uv;
+ }
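+
+ // Worked example (illustrative): for nw = 5, fixed_mult =
+ // (1.0/5) * 2^32 = 858993459. With product_val = 3 the phase is
+ // (858993459 * 3) >> 16 = 39321, i.e. 3/5 = 0.6 in Q16 -- exactly the
+ // form of t the c*_fixed() kernels expect.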
+
+#else
+
+ for (i = 0; i < g_b_scaler.nw; i++)
+ {
+ phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
+ g_b_scaler.c_w[i*4] = (C3(phase_offset) * 4096.0);
+ g_b_scaler.c_w[i*4+1] = (C2(phase_offset) * 4096.0);
+ g_b_scaler.c_w[i*4+2] = (C1(phase_offset) * 4096.0);
+ g_b_scaler.c_w[i*4+3] = (C0(phase_offset) * 4096.0);
+ }
+
+ for (i = 0; i < g_b_scaler.nh; i++)
+ {
+ phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
+ g_b_scaler.c_h[i*4] = (C0(phase_offset) * 4096.0);
+ g_b_scaler.c_h[i*4+1] = (C1(phase_offset) * 4096.0);
+ g_b_scaler.c_h[i*4+2] = (C2(phase_offset) * 4096.0);
+ g_b_scaler.c_h[i*4+3] = (C3(phase_offset) * 4096.0);
+ }
+
+ for (i = 0; i < g_b_scaler.nh_uv; i++)
+ {
+ phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
+ g_b_scaler.c_h_uv[i*4] = (C0(phase_offset) * 4096.0);
+ g_b_scaler.c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0);
+ g_b_scaler.c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0);
+ g_b_scaler.c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0);
+ }
+
+#endif
+
+ // Create an array that maps output lines to input lines. This doesn't
+ // require floating point math, but it does require a division, and
+ // because hardware division is not present, each division becomes a
+ // function call.
+ for (i = 0; i < out_width; i++)
+ {
+ g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;
+
+ if ((g_b_scaler.l_w[i] + 2) <= in_width)
+ g_b_scaler.max_usable_out_width = i;
+
+ }
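+
+ // Example (illustrative, with nw = 5 and d_w = 4 as in a 640 -> 800
+ // upscale): l_w[] = 0, 0, 1, 2, 3, 4, 4, 5, ... -- every 5 output
+ // columns advance the source column by 4. max_usable_out_width ends
+ // up as the last output column whose four-tap window still fits
+ // inside the input row.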
+
+ for (i = 0; i < out_height + 1; i++)
+ {
+ g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
+ g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
+ }
+
+ return 0;
+}
+
+int bicubic_scale(int in_width, int in_height, int in_stride,
+ int out_width, int out_height, int out_stride,
+ unsigned char *input_image, unsigned char *output_image)
+{
+ short *RESTRICT l_w, *RESTRICT l_h;
+ short *RESTRICT c_w, *RESTRICT c_h;
+ unsigned char *RESTRICT ip, *RESTRICT op;
+ unsigned char *RESTRICT hbuf;
+ int h, w, lw, lh;
+ int temp_sum;
+ int phase_offset_w, phase_offset_h;
+
+ c_w = g_b_scaler.c_w;
+ c_h = g_b_scaler.c_h;
+
+ op = output_image;
+
+ l_w = g_b_scaler.l_w;
+ l_h = g_b_scaler.l_h;
+
+ phase_offset_h = 0;
+
+ for (h = 0; h < out_height; h++)
+ {
+ // select the row to work on
+ lh = l_h[h];
+ ip = input_image + (in_stride * lh);
+
+ // Filter the row vertically into a temporary buffer. If the phase
+ // offset is 0, the multiplications would just reproduce the input, so
+ // in that case point the temporary buffer at the input instead. Also
+ // handle the boundary condition of not being able to filter the last
+ // two lines.
+ if (phase_offset_h && (lh < in_height - 2))
+ {
+ hbuf = g_b_scaler.hbuf;
+
+ for (w = 0; w < in_width; w++)
+ {
+ temp_sum = c_h[phase_offset_h*4+3] * ip[w - in_stride];
+ temp_sum += c_h[phase_offset_h*4+2] * ip[w];
+ temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
+ temp_sum += c_h[phase_offset_h*4] * ip[w + 2*in_stride];
+
+ hbuf[w] = temp_sum >> 12;
+ }
+ }
+ else
+ hbuf = ip;
+
+ // increase the phase offset for the next time around.
+ if (++phase_offset_h >= g_b_scaler.nh)
+ phase_offset_h = 0;
+
+ // now filter and expand it horizontally into the final
+ // output buffer
+ phase_offset_w = 0;
+
+ for (w = 0; w < out_width; w++)
+ {
+ // get the index to use to expand the image
+ lw = l_w[w];
+
+ temp_sum = c_w[phase_offset_w*4] * hbuf[lw - 1];
+ temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
+ temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
+ temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
+ temp_sum = temp_sum >> 12;
+
+ if (++phase_offset_w >= g_b_scaler.nw)
+ phase_offset_w = 0;
+
+ // boundary conditions
+ if ((lw + 2) >= in_width)
+ temp_sum = hbuf[lw];
+
+ if (lw == 0)
+ temp_sum = hbuf[0];
+
+ op[w] = temp_sum;
+ }
+
+ op += out_stride;
+ }
+
+ return 0;
+}
+
+void bicubic_scale_frame_reset()
+{
+ g_b_scaler.out_width = 0;
+ g_b_scaler.out_height = 0;
+}
+
+void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ int new_width, int new_height)
+{
+
+ dst->y_width = new_width;
+ dst->y_height = new_height;
+ dst->uv_width = new_width / 2;
+ dst->uv_height = new_height / 2;
+
+ dst->y_stride = dst->y_width;
+ dst->uv_stride = dst->uv_width;
+
+ bicubic_scale(src->y_width, src->y_height, src->y_stride,
+ new_width, new_height, dst->y_stride,
+ src->y_buffer, dst->y_buffer);
+
+ bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
+ new_width / 2, new_height / 2, dst->uv_stride,
+ src->u_buffer, dst->u_buffer);
+
+ bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
+ new_width / 2, new_height / 2, dst->uv_stride,
+ src->v_buffer, dst->v_buffer);
+}
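+
+// Typical call sequence (a sketch inferred from this file, not code
+// from it): nothing here calls bicubic_coefficient_setup(), so the
+// caller is expected to do that before scaling, e.g.
+//
+//   bicubic_coefficient_setup(src->y_width, src->y_height, 800, 600);
+//   bicubic_scale_frame(src, dst, 800, 600);
+//   ...
+//   bicubic_coefficient_destroy();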