summaryrefslogtreecommitdiff
path: root/vpx_scale
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_scale')
-rw-r--r--vpx_scale/arm/armv4/gen_scalers_armv4.asm773
-rw-r--r--vpx_scale/arm/nds/yv12extend.c220
-rw-r--r--vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm227
-rw-r--r--vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm499
-rw-r--r--vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm257
-rw-r--r--vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm587
-rw-r--r--vpx_scale/arm/scalesystemdependant.c87
-rw-r--r--vpx_scale/arm/yv12extend_arm.c24
-rw-r--r--vpx_scale/blackfin/yv12config.c119
-rw-r--r--vpx_scale/blackfin/yv12extend.c349
-rw-r--r--vpx_scale/dm642/bicubic_scaler_c64.c193
-rw-r--r--vpx_scale/dm642/gen_scalers_c64.c607
-rw-r--r--vpx_scale/dm642/yv12extend.c445
-rw-r--r--vpx_scale/generic/bicubic_scaler.c601
-rw-r--r--vpx_scale/generic/gen_scalers.c954
-rw-r--r--vpx_scale/generic/scalesystemdependant.c79
-rw-r--r--vpx_scale/generic/vpxscale.c1088
-rw-r--r--vpx_scale/generic/yv12config.c110
-rw-r--r--vpx_scale/generic/yv12extend.c279
-rw-r--r--vpx_scale/include/arm/vpxscale_nofp.h67
-rw-r--r--vpx_scale/include/generic/vpxscale_arbitrary.h55
-rw-r--r--vpx_scale/include/generic/vpxscale_depricated.h33
-rw-r--r--vpx_scale/include/generic/vpxscale_nofp.h50
-rw-r--r--vpx_scale/include/leapster/vpxscale.h61
-rw-r--r--vpx_scale/include/symbian/vpxscale_nofp.h67
-rw-r--r--vpx_scale/include/vpxscale_nofp.h15
-rw-r--r--vpx_scale/intel_linux/scaleopt.c1852
-rw-r--r--vpx_scale/intel_linux/scalesystemdependant.c90
-rw-r--r--vpx_scale/leapster/doptsystemdependant_lf.c71
-rw-r--r--vpx_scale/leapster/gen_scalers_lf.c521
-rw-r--r--vpx_scale/leapster/vpxscale_lf.c890
-rw-r--r--vpx_scale/leapster/yv12extend.c231
-rw-r--r--vpx_scale/scale_mode.h28
-rw-r--r--vpx_scale/symbian/gen_scalers_armv4.asm773
-rw-r--r--vpx_scale/symbian/gen_scalers_armv4.s808
-rw-r--r--vpx_scale/symbian/scalesystemdependant.c57
-rw-r--r--vpx_scale/vpx_scale.mk23
-rw-r--r--vpx_scale/vpxscale.h113
-rw-r--r--vpx_scale/wce/gen_scalers_armv4.asm773
-rw-r--r--vpx_scale/wce/scalesystemdependant.c59
-rw-r--r--vpx_scale/win32/scaleopt.c1749
-rw-r--r--vpx_scale/win32/scalesystemdependant.c90
-rw-r--r--vpx_scale/x86_64/scaleopt.c1749
-rw-r--r--vpx_scale/x86_64/scalesystemdependant.c60
-rw-r--r--vpx_scale/yv12config.h70
-rw-r--r--vpx_scale/yv12extend.h32
46 files changed, 17885 insertions, 0 deletions
diff --git a/vpx_scale/arm/armv4/gen_scalers_armv4.asm b/vpx_scale/arm/armv4/gen_scalers_armv4.asm
new file mode 100644
index 000000000..1c904edae
--- /dev/null
+++ b/vpx_scale/arm/armv4/gen_scalers_armv4.asm
@@ -0,0 +1,773 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ ; Entry points exported by this file: one horizontal-line scaler and one
+ ; vertical-band scaler for each of the ratios 4:5, 2:3, 3:5, 3:4 and 1:2.
+ EXPORT |horizontal_line_4_5_scale_armv4|
+ EXPORT |vertical_band_4_5_scale_armv4|
+ EXPORT |horizontal_line_2_3_scale_armv4|
+ EXPORT |vertical_band_2_3_scale_armv4|
+ EXPORT |horizontal_line_3_5_scale_armv4|
+ EXPORT |vertical_band_3_5_scale_armv4|
+ EXPORT |horizontal_line_3_4_scale_armv4|
+ EXPORT |vertical_band_3_4_scale_armv4|
+ EXPORT |horizontal_line_1_2_scale_armv4|
+ EXPORT |vertical_band_1_2_scale_armv4|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ ; Register aliases used by the routines in this file.
+src RN r0 ; first argument (the vertical_band routines receive dest here)
+srcw RN r1 ; second argument of the horizontal_line routines (source_width)
+dest RN r2 ; third argument of the horizontal_line routines
+mask RN r12 ; byte-select mask (loaded with 255 or 0xff00ff by its users)
+c51_205 RN r10 ; loaded with 0x3300cd: 51 in the upper halfword, 205 in the lower
+c102_154 RN r11 ; loaded with 0x66009a: 102 in the upper halfword, 154 in the lower
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_4_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 4 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+; r0 = UINT8 *source
+; r1 = UINT32 source_width
+; r2 = UINT8 *dest
+; r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+ ; Scales one line up by 4:5. Each group of four source pixels a, b, c, d
+ ; (plus e, the first pixel of the following group) produces five outputs:
+ ; a, 51a+205b, 102b+154c, 154c+102d, 205d+51e (weights in 1/256ths).
+ ;
+ ; Two pixels are packed as lo | (hi << 16) and multiplied by a packed
+ ; constant pair, so the weighted sum appears in bits 16..31 of the
+ ; product; adding 0x8000 and shifting right by 24 reduces it to a byte.
+ ;
+ ; The source is read one 32-bit word at a time and the lowest byte of
+ ; each word is treated as the first pixel of the group.
+ stmdb sp!, {r4 - r11, lr}
+
+ mov mask, #255 ; mask for selection
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+ ldr r3, [src], #4 ; first group of four pixels
+
+hl45_loop
+
+ and r4, r3, mask ; a = src[0]
+ and r5, mask, r3, lsr #8 ; b = src[1]
+ strb r4, [dest], #1 ; out[0] = a
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ and r7, mask, r3, lsr #16 ; c = src[2]
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 ; d = src[3]
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; out[1]
+
+ orr r7, r8, r7, lsl #16 ; c | d
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ ldr r3, [src], #4 ; fetch the next group (its first pixel is e)
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; out[2]
+
+ add r7, r7, #0x8000
+ and r9, mask, r3 ; e = src[4]
+ orr r9, r9, r8, lsl #16 ; d | e
+ mul r9, c51_205, r9 ; d * 205 + 51 * e
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; out[3]
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #4 ; four source pixels consumed
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 ; out[4]
+
+ bne hl45_loop ; flags still come from the subs above
+
+ ; Tail: the word already in r3 is expanded without a following pixel.
+ ; The bytes must be extracted with lsr, exactly as in the loop; with
+ ; lsl the low byte selected by the 255 mask is always zero.
+ and r4, r3, mask ; a
+ and r5, mask, r3, lsr #8 ; b
+ strb r4, [dest], #1
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ and r7, mask, r3, lsr #16 ; c
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 ; d
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1
+
+ orr r7, r8, r7, lsl #16 ; c | d
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1
+
+ ; NOTE(review): src was post-incremented past the word held in r3, so
+ ; this copies the byte that follows d rather than d itself - confirm
+ ; this is the intended final pixel.
+ ldrb r3, [src]
+ strb r3, [dest], #1
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_4_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+; * height of the band scaled is 4-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+ ; Walks dest_width (r2) columns. For each column it reads rows 0, 1, 2, 3
+ ; and 5 (a, b, c, d, e) and overwrites rows 1..4 with the blends
+ ; 51a+205b, 102b+154c, 154c+102d and 205d+51e (weights in 1/256ths).
+ ; r0 (alias src) is the column pointer, r1 the pitch between rows.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl45_loop
+ mov r3, src ; r3 = read cursor, stepped one row per load
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+ add lr, src, r1 ; lr = write cursor, starts at row 1
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ ldrb r8, [r3], r1 ; d = des[dest_pitch*3]
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ orr r7, r8, r7, lsl #16 ; c | d
+ mov r6, r6, lsr #24 ; weighted sum sits in the top halfword; keep its top byte
+ strb r6, [lr], r1 ; row 1
+
+ ldrb r9, [r3, r1] ; e = des [dest_pitch * 5]
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ orr r9, r9, r8, lsl #16 ; d | e
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 ; row 2
+
+ mul r9, c51_205, r9 ; d * 205 + 51 * e
+ add r7, r7, #0x8000
+ add src, src, #1 ; next column
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 ; row 3
+
+ add r9, r9, #0x8000
+ subs r2, r2, #1 ; one column done
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 ; row 4
+
+ bne vl45_loop ; flags still come from the subs above
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_2_3_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 2 to 3.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+ ; Scales one line up by 2:3. Each loop pass consumes two source pixels
+ ; a, b (peeking at c, the first pixel of the next pair) and writes
+ ; a, (85a + 171b + 128) >> 8 and (171b + 85c + 128) >> 8.
+ stmdb sp!, {r4 - r11, lr}
+ ldr lr, =85 ; weight of the farther pixel
+ ldr r12, =171 ; weight of the nearer pixel
+
+hl23_loop
+
+ ldrb r3, [src], #1 ; a
+ ldrb r4, [src], #1 ; b
+ ldrb r5, [src] ; c (src is not advanced: c is the next pass's a)
+
+ strb r3, [dest], #1
+ mul r4, r12, r4 ; b * 171
+ mla r6, lr, r3, r4 ; a * 85 + b * 171
+ mla r7, lr, r5, r4 ; c * 85 + b * 171
+
+ add r6, r6, #128 ; round
+ mov r6, r6, lsr #8
+ strb r6, [dest], #1
+
+ add r7, r7, #128 ; round
+ mov r7, r7, lsr #8
+ strb r7, [dest], #1
+
+ subs srcw, srcw, #2 ; two source pixels consumed
+ bne hl23_loop
+
+ ; Tail: r5 still holds the pixel src points at; it acts as a, and the
+ ; byte after it as b. Output is a, (85a + 171b + 128) >> 8, b.
+ ldrb r4, [src, #1] ; b
+ strb r5, [dest], #1
+ strb r4, [dest, #1] ; b goes to the last slot, one past the blend below
+
+ mul r4, r12, r4 ; b * 171
+ mla r6, lr, r5, r4 ; a * 85 + b *171
+
+ add r6, r6, #128
+ mov r6, r6, lsr #8
+ strb r6, [dest]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_2_3_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+; * height of the band scaled is 2-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+ ; Walks dest_width (r2) columns. For each column it reads rows 0, 1 and 3
+ ; (a, b, c) and overwrites row 1 with (85a + 171b + 128) >> 8 and row 2
+ ; with (171b + 85c + 128) >> 8. r0 (alias src) is the column pointer.
+ stmdb sp!, {r4 - r8, lr}
+ ldr lr, =85
+ ldr r12, =171
+ add r3, r1, r1, lsl #1 ; 3 * dest_pitch
+
+vl23_loop
+ ldrb r4, [src] ; a = des [0]
+ ldrb r5, [src, r1] ; b = des [dest_pitch]
+ ldrb r7, [src, r3] ; c = des [dest_pitch*3]
+ subs r2, r2, #1 ; column counter; flags are consumed by the bne below
+
+ mul r5, r12, r5 ; b * 171
+ mla r6, lr, r4, r5 ; a * 85 + b * 171
+ mla r8, lr, r7, r5 ; c * 85 + b * 171
+
+ add r6, r6, #128 ; round
+ mov r6, r6, lsr #8
+ strb r6, [src, r1] ; row 1
+
+ add r8, r8, #128 ; round
+ mov r8, r8, lsr #8
+ strb r8, [src, r1, lsl #1] ; row 2
+
+ add src, src, #1 ; next column
+
+ bne vl23_loop
+
+ ldmia sp!, {r4 - r8, pc}
+ ENDP ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+ ; Scales one line up by 3:5. Each group of three source pixels a, b, c
+ ; (plus d, the first pixel of the following group) produces five outputs:
+ ; a, 102a+154b, 205b+51c, 51b+205c, 154c+102d (weights in 1/256ths).
+ ;
+ ; Two pixels are packed as lo | (hi << 16) and multiplied by a packed
+ ; constant pair, so the weighted sum appears in bits 16..31 of the
+ ; product; adding 0x8000 and shifting right by 24 reduces it to a byte.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+ ldrb r4, [src], #1 ; a = src[0]
+
+hl35_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ strb r4, [dest], #1 ; out[0] = a
+
+ orr r6, r4, r8, lsl #16 ; b | a
+ ldrb r9, [src], #1 ; c = src[2]
+ mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 ; b | c
+ mul r5, c51_205, r5 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ ldrb r4, [src], #1 ; d = src[3] (becomes a of the next group)
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; out[1]
+
+ orr r7, r8, r9, lsl #16 ; c | b
+ mul r7, c51_205, r7 ; c * 205 + 51 * b
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; out[2]
+
+ orr r9, r4, r9, lsl #16 ; c | d
+ mul r9, c102_154, r9 ; c * 154 + 102 * d
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; out[3]
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #3 ; three source pixels consumed
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 ; out[4]
+
+ bpl hl35_loop ; flags still come from the subs above
+
+ ; Tail: expand the final group without a following pixel. b has to be
+ ; loaded into r8 because that is the register the arithmetic below
+ ; reads; loading it anywhere else leaves the previous group's b in use.
+ ldrb r8, [src], #1 ; b = src[1]
+ strb r4, [dest], #1
+
+ orr r6, r4, r8, lsl #16 ; b | a
+ ldrb r9, [src], #1 ; c = src[2]
+ mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 ; b | c
+ mul r5, c51_205, r5 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1
+
+ orr r7, r8, r9, lsl #16 ; c | b
+ mul r7, c51_205, r7 ; c * 205 + 51 * b
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1
+ strb r9, [dest], #1 ; last output is c unchanged
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+ ; Walks dest_width (r2) columns. For each column it reads rows 0, 1, 2
+ ; and 5 (a, b, c, d) and overwrites rows 1..4 with the blends
+ ; 102a+154b, 205b+51c, 51b+205c and 154c+102d (weights in 1/256ths).
+ ; r0 (alias src) is the column pointer, r1 the pitch between rows.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl35_loop
+ mov r3, src ; r3 = read cursor, stepped one row per load
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+ add lr, src, r1 ; lr = write cursor, starts at row 1
+
+ orr r8, r4, r5, lsl #16 ; b | a
+ mul r6, c102_154, r8 ; a * 102 + 154 * b
+
+ ldrb r8, [r3, r1, lsl #1] ; d = des[dest_pitch*5]
+ orr r3, r7, r5, lsl #16 ; b | c (r3 is reused as scratch from here on)
+ mul r9, c51_205, r3 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ orr r3, r5, r7, lsl #16 ; c | b
+ mov r6, r6, lsr #24
+ strb r6, [lr], r1 ; row 1
+
+ mul r5, c51_205, r3 ; c * 205 + 51 * b
+ add r9, r9, #0x8000
+ orr r3, r8, r7, lsl #16 ; c | d
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 ; row 2
+
+ mul r7, c102_154, r3 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ add src, src, #1 ; next column
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 ; row 3
+
+ add r7, r7, #0x8000
+ subs r2, r2, #1 ; one column done
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 ; row 4
+
+
+ bne vl35_loop ; flags still come from the subs above
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_4_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 4.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC
+ ; Scales one line up by 3:4. Each group of three source pixels a, b, c
+ ; (plus a', the first pixel of the following group) produces four outputs:
+ ; a, (64a + 192b + 128) >> 8, (b + c + 1) >> 1, (192c + 64a' + 128) >> 8.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r10, =64 ; weight 1/4
+ ldr r11, =192 ; weight 3/4
+ mov r9, #128 ; rounding term for the >> 8 results
+
+ ldrb r4, [src], #1 ; a = src[0]
+
+hl34_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ ldrb r7, [src], #1 ; c = src[2]
+ strb r4, [dest], #1 ; out[0] = a
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+
+ add r8, r8, #1 ; b + 1
+ add r8, r8, r7 ; b + c + 1
+ mov r8, r8, asr #1 ; (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1 ; out[1]
+
+ strb r8, [dest], #1 ; out[2]
+
+ ldrb r4, [src], #1 ; a' = first pixel of the next group
+
+ mla r7, r11, r7, r9 ; c*192 + 128
+ mla r7, r4, r10, r7 ; a'*64 + c*192 + 128
+
+ subs srcw, srcw, #3 ; three source pixels consumed
+
+ mov r7, r7, asr #8 ; (a'*64 + c*192 + 128) >> 8
+ strb r7, [dest], #1 ; out[3]
+
+ bpl hl34_loop ; flags still come from the subs above
+
+ ; Tail: expand the final group without a following pixel; the last
+ ; output is c unchanged.
+ ldrb r8, [src], #1 ; b = src[1]
+ ldrb r7, [src], #1 ; c = src[2]
+ strb r4, [dest], #1
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1
+
+ add r8, r8, #1 ; b + 1
+ add r8, r8, r7 ; b + c + 1
+ mov r8, r8, asr #1 ; (b + c + 1) >> 1
+ strb r8, [dest], #1
+ strb r7, [dest], #1
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_4_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC
+ ; Walks dest_width (r2) columns. For each column it reads rows 0, 1, 2
+ ; and 4 (a, b, c, a') and overwrites row 1 with (64a + 192b + 128) >> 8,
+ ; row 2 with (b + c + 1) >> 1 and row 3 with (192c + 64a' + 128) >> 8.
+ ; r0 (alias src) is the column pointer, r1 the pitch between rows.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r10, =64 ; weight 1/4
+ ldr r11, =192 ; weight 3/4
+ mov r9, #128 ; rounding term for the >> 8 results
+
+; ldr r1,[r1]
+vl34_loop
+ mov r3, src ; r3 = read cursor, stepped one row per load
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des [dest_pitch*2]
+ add lr, src, r1 ; lr = write cursor, starts at row 1
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r5, r4 ; a*64 + b*192 + 128
+
+ add r5, r5, #1 ; b + 1
+ add r5, r5, r7 ; b + c + 1
+ mov r5, r5, asr #1 ; (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [lr], r1 ; row 1
+
+ ldrb r4, [r3, r1] ; a' = des [dest_pitch*4]
+
+ strb r5, [lr], r1 ; row 2
+
+ mla r7, r11, r7, r9 ; c*192 + 128
+ mla r7, r4, r10, r7 ; a'*64 + c*192 + 128
+ mov r7, r7, asr #8 ; (a'*64 + c*192 + 128) >> 8
+
+ add src, src, #1 ; next column
+ subs r2, r2, #1 ; one column done
+
+ strb r7, [lr] ; row 3
+
+ bne vl34_loop ; flags still come from the subs above
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_1_2_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 1 to 2.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC
+ ; Scales one line up by 1:2. For each source pixel a with right-hand
+ ; neighbour b it stores the pair a, (a + b + 1) >> 1 as one halfword;
+ ; the last source pixel is simply written twice.
+ ; NOTE(review): with source_width == 1 the counter starts at 0 and the
+ ; first subs takes it negative, so the bne loop would not stop at the
+ ; expected point - confirm callers never pass a width of 1.
+ stmdb sp!, {r4 - r5, lr}
+
+ sub srcw, srcw, #1 ; the last pixel is handled after the loop
+
+ ldrb r3, [src], #1 ; a
+ ldrb r4, [src], #1 ; b
+hl12_loop
+ subs srcw, srcw, #1 ; flags are consumed by ldrneb/bne below
+
+ add r5, r3, r4
+ add r5, r5, #1
+ mov r5, r5, lsr #1 ; (a + b + 1) >> 1
+
+ orr r5, r3, r5, lsl #8 ; low byte = a, high byte = average
+ strh r5, [dest], #2 ; write both output pixels at once
+
+ mov r3, r4 ; b becomes the next a
+
+ ldrneb r4, [src], #1 ; fetch the next b only if another pass follows
+ bne hl12_loop
+
+ orr r5, r4, r4, lsl #8 ; final pixel duplicated
+ strh r5, [dest]
+
+ ldmia sp!, {r4 - r5, pc}
+ ENDP ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_1_2_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+; * height of the band scaled is 1-pixel.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC
+ ; Fills row 1 with the rounded average of rows 0 and 2, four pixels per
+ ; pass. Each 32-bit word is split into its even bytes (mask 0xff00ff)
+ ; and its odd bytes (shifted down by 8, then masked) so that two byte
+ ; sums can be formed in one add without carrying into each other.
+ ; r0 (alias src) is the column pointer, r1 the pitch, r2 the width.
+ ; NOTE(review): the loop ends on bpl, so when dest_width is a multiple
+ ; of 4 one extra 4-pixel pass runs after the counter reaches 0 - confirm
+ ; this overrun is intended.
+ stmdb sp!, {r4 - r7, lr}
+
+ ldr mask, =0xff00ff ; mask for selection
+ ldr lr, = 0x010001 ; +1 rounding term for each of the two byte lanes
+
+vl12_loop
+ mov r3, src
+ ldr r4, [r3], r1 ; four pixels of row 0; r3 now points at row 1
+ ldr r5, [r3, r1] ; four pixels of row 2
+
+ add src, src, #4 ; next four columns
+ subs r2, r2, #4 ; flags are consumed by the bpl below
+
+ and r6, r4, mask ; even bytes of row 0
+ and r7, r5, mask ; even bytes of row 2
+
+ add r6, r7, r6
+ add r6, r6, lr ; + 1 per lane
+
+ and r4, mask, r4, lsr #8 ; odd bytes of row 0
+ and r5, mask, r5, lsr #8 ; odd bytes of row 2
+
+ mov r6, r6, lsr #1 ; halve
+ and r6, r6, mask ; drop the bit shifted in from the upper lane
+
+ add r4, r5, r4
+ add r4, r4, lr ; + 1 per lane
+
+ mov r4, r4, lsr #1 ; halve
+ and r4, r4, mask
+
+ orr r5, r6, r4, lsl #8 ; re-interleave even and odd bytes
+
+ str r5, [r3] ; row 1
+
+ bpl vl12_loop
+
+ ldmia sp!, {r4 - r7, pc}
+ ENDP ;|vertical_band_1_2_scale_armv4|
+
+ END
diff --git a/vpx_scale/arm/nds/yv12extend.c b/vpx_scale/arm/nds/yv12extend.c
new file mode 100644
index 000000000..56959cb18
--- /dev/null
+++ b/vpx_scale/arm/nds/yv12extend.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : yv12extend.c
+*
+* Description :
+*
+***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include <nitro.h>
+#include <nitro/mi.h>
+#include <nitro/itcm_begin.h>
+
+//---- DMA Number
+#define DMA_NO 3
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+
+/****************************************************************************
+*
+****************************************************************************/
+/*
+ * Pads a single plane. Each row's first and last pixel is replicated
+ * sideways into the left and right borders; afterwards the first and last
+ * rows (starting from the left border) are replicated into the border rows
+ * above and below the plane.
+ */
+static void
+extend_plane(unsigned char *plane, int stride, int height, int width, unsigned int border)
+{
+    unsigned char *left_px = plane;
+    unsigned char *right_px = plane + width - 1;
+    unsigned char *top_row, *bottom_row;
+    unsigned char *above, *below;
+    int n;
+
+    /* left and right borders, one row at a time */
+    for (n = 0; n < height; n++)
+    {
+        mi_cpu_fill8(left_px - border, left_px[0], border);
+        mi_cpu_fill8(right_px + 1, right_px[0], border);
+        left_px += stride;
+        right_px += stride;
+    }
+
+    /* top and bottom borders: replicate the first and last rows */
+    top_row = plane - border;
+    bottom_row = top_row + (height * stride) - stride;
+    above = top_row - (border * stride);
+    below = bottom_row + stride;
+
+    for (n = 0; n < (int)border; n++)
+    {
+        mi_cpu_copy_fast(top_row, above, stride);
+        mi_cpu_copy_fast(bottom_row, below, stride);
+        above += stride;
+        below += stride;
+    }
+}
+
+/*
+ * Extends the borders of the Y, U and V planes of ybf, in that order.
+ * The U and V planes use half the Y plane's stride, height, width and
+ * border.
+ */
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int border = ybf->border;
+    int stride = ybf->y_stride;
+    int height = ybf->y_height;
+    int width = ybf->y_width;
+
+    extend_plane(ybf->y_buffer, stride, height, width, border);
+
+    stride /= 2;
+    height /= 2;
+    width /= 2;
+    border /= 2;
+
+    extend_plane(ybf->u_buffer, stride, height, width, border);
+    extend_plane(ybf->v_buffer, stride, height, width, border);
+}
+
+
+
+/****************************************************************************
+*
+* ROUTINE : vp8_yv12_copy_frame
+*
+* INPUTS :
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Copies the source image into the destination image and
+* updates the destination's UMV borders.
+*
+* SPECIAL NOTES : The frames are assumed to be identical in size.
+*
+****************************************************************************/
+/*
+ * Copies the source frame's allocation into the destination's with a
+ * single mi_cpu_copy_fast call. The byte count is 3/2 of the bordered
+ * luma plane size plus two extra luma strides.
+ */
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    /* luma rows including the top and bottom borders, times the stride */
+    const int luma_bytes = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride);
+    const int total_bytes = (luma_bytes * 3 / 2) + (src_ybc->y_stride * 2);
+
+    mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, total_bytes);
+}
+
+#include <nitro/itcm_end.h>
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
new file mode 100644
index 000000000..26384c42c
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
@@ -0,0 +1,227 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_yv12_copy_frame_func_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ INCLUDE vpx_asm_offsets.asm
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+|vp8_yv12_copy_frame_func_neon| PROC
+ ; Copies the Y plane and then the U and V planes from src_ybc (r0) to
+ ; dst_ybc (r1), two rows per pass. Each plane is copied in wide chunks
+ ; first (128 bytes per row for Y, 64 for U/V); any remaining columns are
+ ; handled by the extra_cp_* code at the end of the routine.
+ ; NOTE(review): the row counters are height >> 1 and every loop tests its
+ ; counter only after the first pass, so odd heights, zero heights and
+ ; widths below one chunk look unsupported - confirm against callers.
+ push {r4 - r11, lr}
+ vpush {d8 - d15}
+
+ sub sp, sp, #16 ; room for the four chroma plane pointers
+
+ ;Fetch the chroma plane pointers now; they are parked on the stack below
+ ldr r8, [r0, #yv12_buffer_config_u_buffer] ;src U plane
+ ldr r9, [r1, #yv12_buffer_config_u_buffer] ;dst U plane
+ ldr r10, [r0, #yv12_buffer_config_v_buffer] ;src V plane
+ ldr r11, [r1, #yv12_buffer_config_v_buffer] ;dst V plane
+
+ ;Copy Y plane
+ ldr r4, [r0, #yv12_buffer_config_y_height]
+ ldr r5, [r0, #yv12_buffer_config_y_width]
+ ldr r6, [r0, #yv12_buffer_config_y_stride]
+ ldr r7, [r1, #yv12_buffer_config_y_stride]
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+
+ str r8, [sp] ; [sp] = src U
+ str r9, [sp, #4] ; [sp, #4] = dst U
+ str r10, [sp, #8] ; [sp, #8] = src V
+ str r11, [sp, #12] ; [sp, #12] = dst V
+
+ ; copy two rows at one time
+ mov lr, r4, lsr #1
+
+cp_src_to_dst_height_loop
+ mov r8, r2 ; src row n
+ mov r9, r3 ; dst row n
+ add r10, r2, r6 ; src row n + 1
+ add r11, r3, r7 ; dst row n + 1
+ mov r12, r5, lsr #7 ; number of 128-byte chunks per row
+
+ ; NOTE(review): this loop body runs before r12 is tested, so a width
+ ; below 128 (r12 == 0) would wrap the counter - confirm widths are >= 128.
+cp_src_to_dst_width_loop
+ vld1.8 {q0, q1}, [r8]!
+ vld1.8 {q8, q9}, [r10]!
+ vld1.8 {q2, q3}, [r8]!
+ vld1.8 {q10, q11}, [r10]!
+ vld1.8 {q4, q5}, [r8]!
+ vld1.8 {q12, q13}, [r10]!
+ vld1.8 {q6, q7}, [r8]!
+ vld1.8 {q14, q15}, [r10]!
+
+ subs r12, r12, #1
+
+ vst1.8 {q0, q1}, [r9]!
+ vst1.8 {q8, q9}, [r11]!
+ vst1.8 {q2, q3}, [r9]!
+ vst1.8 {q10, q11}, [r11]!
+ vst1.8 {q4, q5}, [r9]!
+ vst1.8 {q12, q13}, [r11]!
+ vst1.8 {q6, q7}, [r9]!
+ vst1.8 {q14, q15}, [r11]!
+
+ bne cp_src_to_dst_width_loop
+
+ subs lr, lr, #1
+ add r2, r2, r6, lsl #1 ; advance both pointers by two rows
+ add r3, r3, r7, lsl #1
+
+ bne cp_src_to_dst_height_loop
+
+ ands r10, r5, #0x7f ;check to see if extra copy is needed
+ sub r11, r5, r10 ; r11 = bytes per row already copied
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+ bne extra_cp_src_to_dst_width ; flags still come from the ands above
+end_of_cp_src_to_dst
+
+;Copy U & V planes
+ ldr r2, [sp] ;srcptr1
+ ldr r3, [sp, #4] ;dstptr1
+ mov r4, r4, lsr #1 ;src uv_height
+ mov r5, r5, lsr #1 ;src uv_width
+ mov r6, r6, lsr #1 ;src uv_stride
+ mov r7, r7, lsr #1 ;dst uv_stride
+
+ mov r1, #2 ; planes left to copy (U, then V); dst_ybc is no longer needed
+
+cp_uv_loop
+
+ ;copy two rows at one time
+ mov lr, r4, lsr #1
+
+cp_src_to_dst_height_uv_loop
+ mov r8, r2 ; src row n
+ mov r9, r3 ; dst row n
+ add r10, r2, r6 ; src row n + 1
+ add r11, r3, r7 ; dst row n + 1
+ mov r12, r5, lsr #6 ; number of 64-byte chunks per row
+
+cp_src_to_dst_width_uv_loop
+ vld1.8 {q0, q1}, [r8]!
+ vld1.8 {q8, q9}, [r10]!
+ vld1.8 {q2, q3}, [r8]!
+ vld1.8 {q10, q11}, [r10]!
+
+ subs r12, r12, #1
+
+ vst1.8 {q0, q1}, [r9]!
+ vst1.8 {q8, q9}, [r11]!
+ vst1.8 {q2, q3}, [r9]!
+ vst1.8 {q10, q11}, [r11]!
+
+ bne cp_src_to_dst_width_uv_loop
+
+ subs lr, lr, #1
+ add r2, r2, r6, lsl #1 ; advance both pointers by two rows
+ add r3, r3, r7, lsl #1
+
+ bne cp_src_to_dst_height_uv_loop
+
+ ands r10, r5, #0x3f ;check to see if extra copy is needed
+ sub r11, r5, r10 ; r11 = bytes per row already copied
+ ldr r2, [sp] ;srcptr1
+ ldr r3, [sp, #4] ;dstptr1
+ bne extra_cp_src_to_dst_uv_width ; flags still come from the ands above
+end_of_cp_src_to_dst_uv
+
+ subs r1, r1, #1 ; one chroma plane finished
+
+ addne sp, sp, #8 ; step sp to the V plane pointer pair
+
+ ldrne r2, [sp] ;srcptr1
+ ldrne r3, [sp, #4] ;dstptr1
+
+ bne cp_uv_loop
+
+ add sp, sp, #8 ; together with the addne above this releases all 16 bytes
+
+ vpop {d8 - d15}
+ pop {r4 - r11, pc}
+
+;=============================
+; Copies the Y columns left over after the 128-byte chunks, 16 bytes per
+; row per step, two rows per pass. The inner loop stops only when r12
+; reaches exactly zero, so the remainder is expected to be a multiple of 16.
+extra_cp_src_to_dst_width
+ add r2, r2, r11 ; skip the columns already copied
+ add r3, r3, r11
+ add r0, r8, r6 ; r0 and r11 are recomputed at the top of the loop below
+ add r11, r9, r7
+
+ mov lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop
+ mov r8, r2 ; src row n
+ mov r9, r3 ; dst row n
+ add r0, r8, r6 ; src row n + 1 (r0 no longer holds src_ybc)
+ add r11, r9, r7 ; dst row n + 1
+
+ mov r12, r10 ; bytes left in this row
+
+extra_cp_src_to_dst_width_loop
+ vld1.8 {q0}, [r8]!
+ vld1.8 {q1}, [r0]!
+
+ subs r12, r12, #16
+
+ vst1.8 {q0}, [r9]!
+ vst1.8 {q1}, [r11]!
+ bne extra_cp_src_to_dst_width_loop
+
+ subs lr, lr, #1
+
+ add r2, r2, r6, lsl #1
+ add r3, r3, r7, lsl #1
+
+ bne extra_cp_src_to_dst_height_loop
+
+ b end_of_cp_src_to_dst
+
+;=================================
+; Copies the U/V columns left over after the 64-byte chunks, 8 bytes per
+; row per step, two rows per pass. The inner loop stops only when r12
+; reaches exactly zero, so the remainder is expected to be a multiple of 8.
+extra_cp_src_to_dst_uv_width
+ add r2, r2, r11 ; skip the columns already copied
+ add r3, r3, r11
+ add r0, r8, r6 ; r0 and r11 are recomputed at the top of the loop below
+ add r11, r9, r7
+
+ mov lr, r4, lsr #1
+extra_cp_src_to_dst_height_uv_loop
+ mov r8, r2 ; src row n
+ mov r9, r3 ; dst row n
+ add r0, r8, r6 ; src row n + 1
+ add r11, r9, r7 ; dst row n + 1
+
+ mov r12, r10 ; bytes left in this row
+
+extra_cp_src_to_dst_width_uv_loop
+ vld1.8 {d0}, [r8]!
+ vld1.8 {d1}, [r0]!
+
+ subs r12, r12, #8
+
+ vst1.8 {d0}, [r9]!
+ vst1.8 {d1}, [r11]!
+ bne extra_cp_src_to_dst_width_uv_loop
+
+ subs lr, lr, #1
+
+ add r2, r2, r6, lsl #1
+ add r3, r3, r7, lsl #1
+
+ bne extra_cp_src_to_dst_height_uv_loop
+
+ b end_of_cp_src_to_dst_uv
+
+ ENDP
+ END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
new file mode 100644
index 000000000..a50ae60d7
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
@@ -0,0 +1,499 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_yv12_copy_frame_yonly_neon|
+ EXPORT |vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ INCLUDE vpx_asm_offsets.asm
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+; In: r0 = src_ybc, r1 = dst_ybc.
+; Copies the Y plane of src_ybc into dst_ybc, then replicates the edge pixels of the
+; destination Y plane into its left/right and top/bottom borders.
+; Note: this is a VP8 function, which uses border=32 or 16. Internal y_width and y_height
+; are always multiples of 16.
+
+|vp8_yv12_copy_frame_yonly_neon| PROC
+ push {r4 - r11, lr}
+ vpush {d8 - d15}
+
+ ldr r4, [r0, #yv12_buffer_config_y_height]
+ ldr r5, [r0, #yv12_buffer_config_y_width]
+ ldr r6, [r0, #yv12_buffer_config_y_stride]
+ ldr r7, [r1, #yv12_buffer_config_y_stride]
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+
+ ; copy two rows at one time
+ mov lr, r4, lsr #1
+
+cp_src_to_dst_height_loop
+ mov r8, r2
+ mov r9, r3
+ add r10, r2, r6 ;second source row
+ add r11, r3, r7 ;second destination row
+ movs r12, r5, lsr #7 ;number of 128-byte chunks per row
+ beq cp_src_to_dst_width_lt_128 ;y_width < 128: the bottom-tested loop below would wrap its counter
+
+cp_src_to_dst_width_loop
+ vld1.8 {q0, q1}, [r8]!
+ vld1.8 {q8, q9}, [r10]!
+ vld1.8 {q2, q3}, [r8]!
+ vld1.8 {q10, q11}, [r10]!
+ vld1.8 {q4, q5}, [r8]!
+ vld1.8 {q12, q13}, [r10]!
+ vld1.8 {q6, q7}, [r8]!
+ vld1.8 {q14, q15}, [r10]!
+
+ subs r12, r12, #1
+
+ vst1.8 {q0, q1}, [r9]!
+ vst1.8 {q8, q9}, [r11]!
+ vst1.8 {q2, q3}, [r9]!
+ vst1.8 {q10, q11}, [r11]!
+ vst1.8 {q4, q5}, [r9]!
+ vst1.8 {q12, q13}, [r11]!
+ vst1.8 {q6, q7}, [r9]!
+ vst1.8 {q14, q15}, [r11]!
+
+ bne cp_src_to_dst_width_loop
+
+ subs lr, lr, #1
+ add r2, r2, r6, lsl #1
+ add r3, r3, r7, lsl #1
+
+ bne cp_src_to_dst_height_loop
+
+cp_src_to_dst_width_lt_128
+ ands r10, r5, #0x7f ;check to see if extra copy is needed
+ sub r11, r5, r10 ;byte offset of the columns not covered by the 128-byte loop
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+ bne extra_cp_src_to_dst_width
+end_of_cp_src_to_dst
+
+
+ ;vpxyv12_extend_frame_borders_yonly
+ mov r0, r1 ;r0 = dst_ybc from here on
+ ;Not need to load y_width, since: y_width = y_stride - 2*border
+ ldr r3, [r0, #yv12_buffer_config_border]
+ ldr r1, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r4, [r0, #yv12_buffer_config_y_height]
+ ldr lr, [r0, #yv12_buffer_config_y_stride]
+
+ cmp r3, #16
+ beq b16_extend_frame_borders
+
+;=======================
+b32_extend_frame_borders
+;border = 32
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+ sub r5, r1, r3 ;destptr1
+ add r6, r1, lr
+ sub r6, r6, r3, lsl #1 ;destptr2
+ sub r2, r6, #1 ;srcptr2
+
+ ;Do four rows at one time
+ mov r12, r4, lsr #2
+
+copy_left_right_y
+ vld1.8 {d0[], d1[]}, [r1], lr
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vld1.8 {d8[], d9[]}, [r1], lr
+ vld1.8 {d12[], d13[]}, [r2], lr
+ vld1.8 {d16[], d17[]}, [r1], lr
+ vld1.8 {d20[], d21[]}, [r2], lr
+ vld1.8 {d24[], d25[]}, [r1], lr
+ vld1.8 {d28[], d29[]}, [r2], lr
+
+ vmov q1, q0
+ vmov q3, q2
+ vmov q5, q4
+ vmov q7, q6
+ vmov q9, q8
+ vmov q11, q10
+ vmov q13, q12
+ vmov q15, q14
+
+ subs r12, r12, #1
+
+ vst1.8 {q0, q1}, [r5], lr
+ vst1.8 {q2, q3}, [r6], lr
+ vst1.8 {q4, q5}, [r5], lr
+ vst1.8 {q6, q7}, [r6], lr
+ vst1.8 {q8, q9}, [r5], lr
+ vst1.8 {q10, q11}, [r6], lr
+ vst1.8 {q12, q13}, [r5], lr
+ vst1.8 {q14, q15}, [r6], lr
+
+ bne copy_left_right_y
+
+;Now copy the top and bottom source lines into each line of the respective borders
+ ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ mul r8, r3, lr ;r8 = border * y_stride
+
+ sub r6, r1, r3 ;destptr2
+ sub r2, r6, lr ;srcptr2
+ sub r1, r7, r3 ;srcptr1
+ sub r5, r1, r8 ;destptr1
+
+ movs r12, lr, lsr #7 ;number of 128-byte chunks per border row
+ beq top_bottom_y_lt_128 ;y_stride < 128: only the 16-byte remainder path applies
+
+copy_top_bottom_y
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q2, q3}, [r1]!
+ vld1.8 {q10, q11}, [r2]!
+ vld1.8 {q4, q5}, [r1]!
+ vld1.8 {q12, q13}, [r2]!
+ vld1.8 {q6, q7}, [r1]!
+ vld1.8 {q14, q15}, [r2]!
+
+ mov r7, r3
+
+top_bottom_32
+ subs r7, r7, #1
+
+ vst1.8 {q0, q1}, [r5]!
+ vst1.8 {q8, q9}, [r6]!
+ vst1.8 {q2, q3}, [r5]!
+ vst1.8 {q10, q11}, [r6]!
+ vst1.8 {q4, q5}, [r5]!
+ vst1.8 {q12, q13}, [r6]!
+ vst1.8 {q6, q7}, [r5]!
+ vst1.8 {q14, q15}, [r6]!
+
+ add r5, r5, lr
+ sub r5, r5, #128
+ add r6, r6, lr
+ sub r6, r6, #128
+
+ bne top_bottom_32
+
+ sub r5, r1, r8
+ add r6, r2, lr
+
+ subs r12, r12, #1
+ bne copy_top_bottom_y
+
+top_bottom_y_lt_128
+ mov r7, lr, lsr #4 ;check to see if extra copy is needed
+ ands r7, r7, #0x7 ;number of 16-byte chunks left over after the 128-byte loop
+ bne extra_top_bottom_y
+end_of_border_copy_y
+
+ vpop {d8 - d15}
+ pop {r4 - r11, pc}
+
+;=====================
+;extra copy part for Y
+extra_top_bottom_y
+ vld1.8 {q0}, [r1]!
+ vld1.8 {q2}, [r2]!
+
+ mov r9, r3, lsr #3 ;each pass of the loop below writes 8 border rows
+
+extra_top_bottom_32
+ subs r9, r9, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ bne extra_top_bottom_32
+
+ sub r5, r1, r8
+ add r6, r2, lr
+ subs r7, r7, #1
+ bne extra_top_bottom_y
+
+ b end_of_border_copy_y
+
+
+;=======================
+b16_extend_frame_borders
+;border = 16
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+ sub r5, r1, r3 ;destptr1
+ add r6, r1, lr
+ sub r6, r6, r3, lsl #1 ;destptr2
+ sub r2, r6, #1 ;srcptr2
+
+ ;Do four rows at one time
+ mov r12, r4, lsr #2
+
+copy_left_right_y_b16
+ vld1.8 {d0[], d1[]}, [r1], lr
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vld1.8 {d8[], d9[]}, [r1], lr
+ vld1.8 {d12[], d13[]}, [r2], lr
+ vld1.8 {d16[], d17[]}, [r1], lr
+ vld1.8 {d20[], d21[]}, [r2], lr
+ vld1.8 {d24[], d25[]}, [r1], lr
+ vld1.8 {d28[], d29[]}, [r2], lr
+
+ subs r12, r12, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q4}, [r5], lr
+ vst1.8 {q6}, [r6], lr
+ vst1.8 {q8}, [r5], lr
+ vst1.8 {q10}, [r6], lr
+ vst1.8 {q12}, [r5], lr
+ vst1.8 {q14}, [r6], lr
+
+ bne copy_left_right_y_b16
+
+;Now copy the top and bottom source lines into each line of the respective borders
+ ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ mul r8, r3, lr ;r8 = border * y_stride
+
+ sub r6, r1, r3 ;destptr2
+ sub r2, r6, lr ;srcptr2
+ sub r1, r7, r3 ;srcptr1
+ sub r5, r1, r8 ;destptr1
+
+ movs r12, lr, lsr #7 ;number of 128-byte chunks per border row
+ beq top_bottom_y_lt_128_b16 ;y_stride < 128: only the 16-byte remainder path applies
+
+copy_top_bottom_y_b16
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q2, q3}, [r1]!
+ vld1.8 {q10, q11}, [r2]!
+ vld1.8 {q4, q5}, [r1]!
+ vld1.8 {q12, q13}, [r2]!
+ vld1.8 {q6, q7}, [r1]!
+ vld1.8 {q14, q15}, [r2]!
+
+ mov r7, r3
+
+top_bottom_16_b16
+ subs r7, r7, #1
+
+ vst1.8 {q0, q1}, [r5]!
+ vst1.8 {q8, q9}, [r6]!
+ vst1.8 {q2, q3}, [r5]!
+ vst1.8 {q10, q11}, [r6]!
+ vst1.8 {q4, q5}, [r5]!
+ vst1.8 {q12, q13}, [r6]!
+ vst1.8 {q6, q7}, [r5]!
+ vst1.8 {q14, q15}, [r6]!
+
+ add r5, r5, lr
+ sub r5, r5, #128
+ add r6, r6, lr
+ sub r6, r6, #128
+
+ bne top_bottom_16_b16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+
+ subs r12, r12, #1
+ bne copy_top_bottom_y_b16
+
+top_bottom_y_lt_128_b16
+ mov r7, lr, lsr #4 ;check to see if extra copy is needed
+ ands r7, r7, #0x7 ;number of 16-byte chunks left over after the 128-byte loop
+ bne extra_top_bottom_y_b16
+end_of_border_copy_y_b16
+
+ vpop {d8 - d15}
+ pop {r4 - r11, pc}
+
+;=====================
+;extra copy part for Y
+extra_top_bottom_y_b16
+ vld1.8 {q0}, [r1]!
+ vld1.8 {q2}, [r2]!
+
+ mov r9, r3, lsr #3 ;each pass of the loop below writes 8 border rows
+
+extra_top_bottom_16_b16
+ subs r9, r9, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ bne extra_top_bottom_16_b16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+ subs r7, r7, #1
+ bne extra_top_bottom_y_b16
+
+ b end_of_border_copy_y_b16
+
+;=============================
+;copy the columns left over by the 128-byte loop, 16 bytes at a time
+;r10 = bytes per row still to copy, r11 = column offset where they start
+extra_cp_src_to_dst_width
+ add r2, r2, r11
+ add r3, r3, r11
+ add r0, r8, r6 ;r0 is reused as the second-row source pointer (src_ybc no longer needed)
+ add r11, r9, r7
+
+ mov lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop
+ mov r8, r2
+ mov r9, r3
+ add r0, r8, r6
+ add r11, r9, r7
+
+ mov r12, r10
+
+extra_cp_src_to_dst_width_loop
+ vld1.8 {q0}, [r8]!
+ vld1.8 {q1}, [r0]!
+
+ subs r12, r12, #16
+
+ vst1.8 {q0}, [r9]!
+ vst1.8 {q1}, [r11]!
+ bne extra_cp_src_to_dst_width_loop
+
+ subs lr, lr, #1
+
+ add r2, r2, r6, lsl #1
+ add r3, r3, r7, lsl #1
+
+ bne extra_cp_src_to_dst_height_loop
+
+ b end_of_cp_src_to_dst
+
+ ENDP
+
+;===========================================================
+;In vp8cx_pick_filter_level(), call vp8_yv12_copy_frame_yonly
+;without extend_frame_borders.
+; In: r0 = src_ybc, r1 = dst_ybc. Copies the Y plane only; no border is written.
+; The remainder path copies 16 bytes at a time, so y_width is expected to be a
+; multiple of 16.
+|vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon| PROC
+ push {r4 - r11, lr}
+ vpush {d8-d15}
+
+ ldr r4, [r0, #yv12_buffer_config_y_height]
+ ldr r5, [r0, #yv12_buffer_config_y_width]
+ ldr r6, [r0, #yv12_buffer_config_y_stride]
+ ldr r7, [r1, #yv12_buffer_config_y_stride]
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+
+ ; copy two rows at one time
+ mov lr, r4, lsr #1
+
+cp_src_to_dst_height_loop1
+ mov r8, r2
+ mov r9, r3
+ add r10, r2, r6 ;second source row
+ add r11, r3, r7 ;second destination row
+ movs r12, r5, lsr #7 ;number of 128-byte chunks per row
+ beq cp_src_to_dst_width_lt_128_1 ;y_width < 128: the bottom-tested loop below would wrap its counter
+
+cp_src_to_dst_width_loop1
+ vld1.8 {q0, q1}, [r8]!
+ vld1.8 {q8, q9}, [r10]!
+ vld1.8 {q2, q3}, [r8]!
+ vld1.8 {q10, q11}, [r10]!
+ vld1.8 {q4, q5}, [r8]!
+ vld1.8 {q12, q13}, [r10]!
+ vld1.8 {q6, q7}, [r8]!
+ vld1.8 {q14, q15}, [r10]!
+
+ subs r12, r12, #1
+
+ vst1.8 {q0, q1}, [r9]!
+ vst1.8 {q8, q9}, [r11]!
+ vst1.8 {q2, q3}, [r9]!
+ vst1.8 {q10, q11}, [r11]!
+ vst1.8 {q4, q5}, [r9]!
+ vst1.8 {q12, q13}, [r11]!
+ vst1.8 {q6, q7}, [r9]!
+ vst1.8 {q14, q15}, [r11]!
+
+ bne cp_src_to_dst_width_loop1
+
+ subs lr, lr, #1
+ add r2, r2, r6, lsl #1
+ add r3, r3, r7, lsl #1
+
+ bne cp_src_to_dst_height_loop1
+
+cp_src_to_dst_width_lt_128_1
+ ands r10, r5, #0x7f ;check to see if extra copy is needed
+ sub r11, r5, r10 ;byte offset of the columns not covered by the 128-byte loop
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+ bne extra_cp_src_to_dst_width1
+end_of_cp_src_to_dst1
+
+ vpop {d8 - d15}
+ pop {r4-r11, pc}
+
+;=============================
+;copy the columns left over by the 128-byte loop, 16 bytes at a time
+;r10 = bytes per row still to copy, r11 = column offset where they start
+extra_cp_src_to_dst_width1
+ add r2, r2, r11
+ add r3, r3, r11
+ add r0, r8, r6 ;r0 is reused as the second-row source pointer (src_ybc no longer needed)
+ add r11, r9, r7
+
+ mov lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop1
+ mov r8, r2
+ mov r9, r3
+ add r0, r8, r6
+ add r11, r9, r7
+
+ mov r12, r10
+
+extra_cp_src_to_dst_width_loop1
+ vld1.8 {q0}, [r8]!
+ vld1.8 {q1}, [r0]!
+
+ subs r12, r12, #16
+
+ vst1.8 {q0}, [r9]!
+ vst1.8 {q1}, [r11]!
+ bne extra_cp_src_to_dst_width_loop1
+
+ subs lr, lr, #1
+
+ add r2, r2, r6, lsl #1
+ add r3, r3, r7, lsl #1
+
+ bne extra_cp_src_to_dst_height_loop1
+
+ b end_of_cp_src_to_dst1
+
+ ENDP
+
+ END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
new file mode 100644
index 000000000..c8923d5a5
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
@@ -0,0 +1,257 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_yv12_copy_src_frame_func_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ INCLUDE vpx_asm_offsets.asm
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: This function is used to copy source data in src_buffer[i] at the beginning of
+;the encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,
+;which can be ANY numbers (NOT always multiples of 16 or 4).
+;Every bottom-tested copy loop below is therefore guarded by a size check, so rows
+;narrower than one vector chunk and planes with fewer than two rows are handled.
+
+;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+; In: r0 = src_ybc, r1 = dst_ybc. Copies the Y, U and V planes; no border is written.
+
+|vp8_yv12_copy_src_frame_func_neon| PROC
+ push {r4 - r11, lr}
+ vpush {d8 - d15}
+
+ ;Copy Y plane
+ ldr r4, [r0, #yv12_buffer_config_y_height]
+ ldr r5, [r0, #yv12_buffer_config_y_width]
+ ldr r6, [r0, #yv12_buffer_config_y_stride]
+ ldr r7, [r1, #yv12_buffer_config_y_stride]
+ ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
+
+ add r10, r2, r6 ;second row src
+ add r11, r3, r7 ;second row dst
+ mov r6, r6, lsl #1
+ mov r7, r7, lsl #1
+ sub r6, r6, r5 ;adjust stride
+ sub r7, r7, r5
+
+ ; copy two rows at one time
+ movs lr, r4, lsr #1
+ beq cp_height_odd ;y_height < 2: no row pair to copy
+
+cp_src_to_dst_height_loop
+ mov r12, r5
+ cmp r12, #128
+ blo cp_width_lt_128 ;row narrower than one 128-byte chunk
+
+cp_width_128_loop
+ vld1.8 {q0, q1}, [r2]!
+ vld1.8 {q4, q5}, [r10]!
+ vld1.8 {q2, q3}, [r2]!
+ vld1.8 {q6, q7}, [r10]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q12, q13}, [r10]!
+ vld1.8 {q10, q11}, [r2]!
+ vld1.8 {q14, q15}, [r10]!
+ sub r12, r12, #128
+ cmp r12, #128
+ vst1.8 {q0, q1}, [r3]!
+ vst1.8 {q4, q5}, [r11]!
+ vst1.8 {q2, q3}, [r3]!
+ vst1.8 {q6, q7}, [r11]!
+ vst1.8 {q8, q9}, [r3]!
+ vst1.8 {q12, q13}, [r11]!
+ vst1.8 {q10, q11}, [r3]!
+ vst1.8 {q14, q15}, [r11]!
+ bhs cp_width_128_loop
+
+cp_width_lt_128
+ cmp r12, #8
+ blo cp_width_lt_8 ;fewer than 8 bytes left (possibly none)
+
+cp_width_8_loop
+ vld1.8 {d0}, [r2]!
+ vld1.8 {d1}, [r10]!
+ sub r12, r12, #8
+ cmp r12, #8
+ vst1.8 {d0}, [r3]!
+ vst1.8 {d1}, [r11]!
+ bhs cp_width_8_loop
+
+cp_width_lt_8
+ cmp r12, #0
+ beq cp_width_done
+
+cp_width_1_loop
+ ldrb r8, [r2], #1
+ subs r12, r12, #1
+ strb r8, [r3], #1
+ ldrb r8, [r10], #1
+ strb r8, [r11], #1
+ bne cp_width_1_loop
+
+cp_width_done
+ subs lr, lr, #1
+ add r2, r2, r6
+ add r3, r3, r7
+ add r10, r10, r6
+ add r11, r11, r7
+ bne cp_src_to_dst_height_loop
+
+cp_height_odd
+;copy last line for Y if y_height is odd
+ tst r4, #1
+ beq cp_width_done_1
+ mov r12, r5
+ cmp r12, #128
+ blo cp_width_lt_128_1 ;row narrower than one 128-byte chunk
+
+cp_width_128_loop_1
+ vld1.8 {q0, q1}, [r2]!
+ vld1.8 {q2, q3}, [r2]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q10, q11}, [r2]!
+ sub r12, r12, #128
+ cmp r12, #128
+ vst1.8 {q0, q1}, [r3]!
+ vst1.8 {q2, q3}, [r3]!
+ vst1.8 {q8, q9}, [r3]!
+ vst1.8 {q10, q11}, [r3]!
+ bhs cp_width_128_loop_1
+
+cp_width_lt_128_1
+ cmp r12, #8
+ blo cp_width_lt_8_1 ;fewer than 8 bytes left (possibly none)
+
+cp_width_8_loop_1
+ vld1.8 {d0}, [r2]!
+ sub r12, r12, #8
+ cmp r12, #8
+ vst1.8 {d0}, [r3]!
+ bhs cp_width_8_loop_1
+
+cp_width_lt_8_1
+ cmp r12, #0
+ beq cp_width_done_1
+
+cp_width_1_loop_1
+ ldrb r8, [r2], #1
+ subs r12, r12, #1
+ strb r8, [r3], #1
+ bne cp_width_1_loop_1
+cp_width_done_1
+
+;Copy U & V planes
+ ldr r4, [r0, #yv12_buffer_config_uv_height]
+ ldr r5, [r0, #yv12_buffer_config_uv_width]
+ ldr r6, [r0, #yv12_buffer_config_uv_stride]
+ ldr r7, [r1, #yv12_buffer_config_uv_stride]
+ ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
+ ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1
+
+ add r10, r2, r6 ;second row src
+ add r11, r3, r7 ;second row dst
+ mov r6, r6, lsl #1
+ mov r7, r7, lsl #1
+ sub r6, r6, r5 ;adjust stride
+ sub r7, r7, r5
+
+ mov r9, #2 ;two chroma planes: U first, then V
+
+cp_uv_loop
+ ;copy two rows at one time
+ movs lr, r4, lsr #1
+ beq cp_uv_height_odd ;uv_height < 2: no row pair to copy
+
+cp_src_to_dst_height_uv_loop
+ mov r12, r5
+ cmp r12, #64
+ blo cp_width_uv_lt_64 ;row narrower than one 64-byte chunk
+
+cp_width_uv_64_loop
+ vld1.8 {q0, q1}, [r2]!
+ vld1.8 {q4, q5}, [r10]!
+ vld1.8 {q2, q3}, [r2]!
+ vld1.8 {q6, q7}, [r10]!
+ sub r12, r12, #64
+ cmp r12, #64
+ vst1.8 {q0, q1}, [r3]!
+ vst1.8 {q4, q5}, [r11]!
+ vst1.8 {q2, q3}, [r3]!
+ vst1.8 {q6, q7}, [r11]!
+ bhs cp_width_uv_64_loop
+
+cp_width_uv_lt_64
+ cmp r12, #8
+ blo cp_width_uv_lt_8 ;fewer than 8 bytes left (possibly none)
+
+cp_width_uv_8_loop
+ vld1.8 {d0}, [r2]!
+ vld1.8 {d1}, [r10]!
+ sub r12, r12, #8
+ cmp r12, #8
+ vst1.8 {d0}, [r3]!
+ vst1.8 {d1}, [r11]!
+ bhs cp_width_uv_8_loop
+
+cp_width_uv_lt_8
+ cmp r12, #0
+ beq cp_width_uv_done
+
+cp_width_uv_1_loop
+ ldrb r8, [r2], #1
+ subs r12, r12, #1
+ strb r8, [r3], #1
+ ldrb r8, [r10], #1
+ strb r8, [r11], #1
+ bne cp_width_uv_1_loop
+
+cp_width_uv_done
+ subs lr, lr, #1
+ add r2, r2, r6
+ add r3, r3, r7
+ add r10, r10, r6
+ add r11, r11, r7
+ bne cp_src_to_dst_height_uv_loop
+
+cp_uv_height_odd
+;copy last line for U & V if uv_height is odd
+ tst r4, #1
+ beq cp_width_uv_done_1
+ mov r12, r5
+ cmp r12, #64
+ blo cp_width_uv_lt_64_1 ;row narrower than one 64-byte chunk
+
+cp_width_uv_64_loop_1
+ vld1.8 {q0, q1}, [r2]!
+ vld1.8 {q2, q3}, [r2]!
+ sub r12, r12, #64
+ cmp r12, #64
+ vst1.8 {q0, q1}, [r3]!
+ vst1.8 {q2, q3}, [r3]!
+ bhs cp_width_uv_64_loop_1
+
+cp_width_uv_lt_64_1
+ cmp r12, #8
+ blo cp_width_uv_lt_8_1 ;fewer than 8 bytes left (possibly none)
+
+cp_width_uv_8_loop_1
+ vld1.8 {d0}, [r2]!
+ sub r12, r12, #8
+ cmp r12, #8
+ vst1.8 {d0}, [r3]!
+ bhs cp_width_uv_8_loop_1
+
+cp_width_uv_lt_8_1
+ cmp r12, #0
+ beq cp_width_uv_done_1
+
+cp_width_uv_1_loop_1
+ ldrb r8, [r2], #1
+ subs r12, r12, #1
+ strb r8, [r3], #1
+ bne cp_width_uv_1_loop_1
+cp_width_uv_done_1
+
+ subs r9, r9, #1 ;flags from here select the V-plane setup below
+ ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
+ ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1
+ ldrne r10, [r0, #yv12_buffer_config_uv_stride]
+ ldrne r11, [r1, #yv12_buffer_config_uv_stride]
+
+ addne r10, r2, r10 ;second row src
+ addne r11, r3, r11 ;second row dst
+
+ bne cp_uv_loop
+
+ vpop {d8 - d15}
+ pop {r4 - r11, pc}
+
+ ENDP
+ END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
new file mode 100644
index 000000000..8c9ce1962
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
@@ -0,0 +1,587 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_yv12_extend_frame_borders_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ INCLUDE vpx_asm_offsets.asm
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
+; In: r0 = ybf. Replicates the edge pixels of the Y, U and V planes into their
+; left/right and top/bottom borders.
+;Note: this is a VP8 function, which uses border=32 or 16. Internal y_width and y_height
+; are always multiples of 16.
+
+|vp8_yv12_extend_frame_borders_neon| PROC
+ push {r4 - r10, lr}
+ vpush {d8 - d15}
+
+ ;Not need to load y_width, since: y_width = y_stride - 2*border
+ ldr r3, [r0, #yv12_buffer_config_border]
+ ldr r1, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ ldr r4, [r0, #yv12_buffer_config_y_height]
+ ldr lr, [r0, #yv12_buffer_config_y_stride]
+
+ cmp r3, #16
+ beq b16_extend_frame_borders
+
+;=======================
+b32_extend_frame_borders
+;border = 32
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+ sub r5, r1, r3 ;destptr1
+ add r6, r1, lr
+ sub r6, r6, r3, lsl #1 ;destptr2
+ sub r2, r6, #1 ;srcptr2
+
+ ;Do four rows at one time
+ mov r12, r4, lsr #2
+
+copy_left_right_y
+ vld1.8 {d0[], d1[]}, [r1], lr
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vld1.8 {d8[], d9[]}, [r1], lr
+ vld1.8 {d12[], d13[]}, [r2], lr
+ vld1.8 {d16[], d17[]}, [r1], lr
+ vld1.8 {d20[], d21[]}, [r2], lr
+ vld1.8 {d24[], d25[]}, [r1], lr
+ vld1.8 {d28[], d29[]}, [r2], lr
+
+ vmov q1, q0
+ vmov q3, q2
+ vmov q5, q4
+ vmov q7, q6
+ vmov q9, q8
+ vmov q11, q10
+ vmov q13, q12
+ vmov q15, q14
+
+ subs r12, r12, #1
+
+ vst1.8 {q0, q1}, [r5], lr
+ vst1.8 {q2, q3}, [r6], lr
+ vst1.8 {q4, q5}, [r5], lr
+ vst1.8 {q6, q7}, [r6], lr
+ vst1.8 {q8, q9}, [r5], lr
+ vst1.8 {q10, q11}, [r6], lr
+ vst1.8 {q12, q13}, [r5], lr
+ vst1.8 {q14, q15}, [r6], lr
+
+ bne copy_left_right_y
+
+;Now copy the top and bottom source lines into each line of the respective borders
+ ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ mul r8, r3, lr ;r8 = border * y_stride
+
+ sub r6, r1, r3 ;destptr2
+ sub r2, r6, lr ;srcptr2
+ sub r1, r7, r3 ;srcptr1
+ sub r5, r1, r8 ;destptr1
+
+ movs r12, lr, lsr #7 ;number of 128-byte chunks per border row
+ beq top_bottom_y_lt_128 ;y_stride < 128: only the 16-byte remainder path applies
+
+copy_top_bottom_y
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q2, q3}, [r1]!
+ vld1.8 {q10, q11}, [r2]!
+ vld1.8 {q4, q5}, [r1]!
+ vld1.8 {q12, q13}, [r2]!
+ vld1.8 {q6, q7}, [r1]!
+ vld1.8 {q14, q15}, [r2]!
+
+ mov r7, r3
+
+top_bottom_32
+ subs r7, r7, #1
+
+ vst1.8 {q0, q1}, [r5]!
+ vst1.8 {q8, q9}, [r6]!
+ vst1.8 {q2, q3}, [r5]!
+ vst1.8 {q10, q11}, [r6]!
+ vst1.8 {q4, q5}, [r5]!
+ vst1.8 {q12, q13}, [r6]!
+ vst1.8 {q6, q7}, [r5]!
+ vst1.8 {q14, q15}, [r6]!
+
+ add r5, r5, lr
+ sub r5, r5, #128
+ add r6, r6, lr
+ sub r6, r6, #128
+
+ bne top_bottom_32
+
+ sub r5, r1, r8
+ add r6, r2, lr
+
+ subs r12, r12, #1
+ bne copy_top_bottom_y
+
+top_bottom_y_lt_128
+ mov r7, lr, lsr #4 ;check to see if extra copy is needed
+ ands r7, r7, #0x7 ;number of 16-byte chunks left over after the 128-byte loop
+ bne extra_top_bottom_y
+end_of_border_copy_y
+
+;Border copy for U, V planes
+ ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
+ mov lr, lr, lsr #1 ;uv_stride
+ mov r3, r3, lsr #1 ;border
+ mov r4, r4, lsr #1 ;uv_height
+ mov r8, r8, lsr #2 ;uv border * uv_stride
+
+ mov r10, #2 ;two chroma planes: U first, then V
+
+;copy the left and right most columns out
+border_copy_uv
+ sub r5, r1, r3 ;destptr1
+ add r6, r1, lr
+ sub r6, r6, r3, lsl #1 ;destptr2
+ sub r2, r6, #1 ;srcptr2
+
+ mov r7, r1
+
+ ;Do eight rows at one time
+ mov r12, r4, lsr #3
+
+copy_left_right_uv
+ vld1.8 {d0[], d1[]}, [r1], lr
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vld1.8 {d4[], d5[]}, [r1], lr
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vld1.8 {d8[], d9[]}, [r1], lr
+ vld1.8 {d10[], d11[]}, [r2], lr
+ vld1.8 {d12[], d13[]}, [r1], lr
+ vld1.8 {d14[], d15[]}, [r2], lr
+ vld1.8 {d16[], d17[]}, [r1], lr
+ vld1.8 {d18[], d19[]}, [r2], lr
+ vld1.8 {d20[], d21[]}, [r1], lr
+ vld1.8 {d22[], d23[]}, [r2], lr
+ vld1.8 {d24[], d25[]}, [r1], lr
+ vld1.8 {d26[], d27[]}, [r2], lr
+ vld1.8 {d28[], d29[]}, [r1], lr
+ vld1.8 {d30[], d31[]}, [r2], lr
+
+ subs r12, r12, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q1}, [r6], lr
+ vst1.8 {q2}, [r5], lr
+ vst1.8 {q3}, [r6], lr
+ vst1.8 {q4}, [r5], lr
+ vst1.8 {q5}, [r6], lr
+ vst1.8 {q6}, [r5], lr
+ vst1.8 {q7}, [r6], lr
+ vst1.8 {q8}, [r5], lr
+ vst1.8 {q9}, [r6], lr
+ vst1.8 {q10}, [r5], lr
+ vst1.8 {q11}, [r6], lr
+ vst1.8 {q12}, [r5], lr
+ vst1.8 {q13}, [r6], lr
+ vst1.8 {q14}, [r5], lr
+ vst1.8 {q15}, [r6], lr
+
+ bne copy_left_right_uv
+
+;Now copy the top and bottom source lines into each line of the respective borders
+ sub r6, r1, r3 ;destptr2
+ sub r2, r6, lr ;srcptr2
+ sub r1, r7, r3 ;srcptr1
+ sub r5, r1, r8 ;destptr1
+
+ movs r12, lr, lsr #6 ;number of 64-byte chunks per border row
+ beq top_bottom_uv_lt_64 ;uv_stride < 64: only the 8-byte remainder path applies
+
+copy_top_bottom_uv
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q2, q3}, [r1]!
+ vld1.8 {q10, q11}, [r2]!
+
+ mov r7, r3
+
+top_bottom_16
+ subs r7, r7, #1
+
+ vst1.8 {q0, q1}, [r5]!
+ vst1.8 {q8, q9}, [r6]!
+ vst1.8 {q2, q3}, [r5]!
+ vst1.8 {q10, q11}, [r6]!
+
+ add r5, r5, lr
+ sub r5, r5, #64
+ add r6, r6, lr
+ sub r6, r6, #64
+
+ bne top_bottom_16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+
+ subs r12, r12, #1
+ bne copy_top_bottom_uv
+
+top_bottom_uv_lt_64
+ mov r7, lr, lsr #3 ;check to see if extra copy is needed
+ ands r7, r7, #0x7 ;number of 8-byte chunks left over after the 64-byte loop
+ bne extra_top_bottom_uv
+
+end_of_border_copy_uv
+ subs r10, r10, #1
+ ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
+ bne border_copy_uv
+
+ vpop {d8 - d15}
+ pop {r4 - r10, pc}
+
+;;;;;;;;;;;;;;;;;;;;;;
+;extra copy part for Y
+extra_top_bottom_y
+ vld1.8 {q0}, [r1]!
+ vld1.8 {q2}, [r2]!
+
+ mov r9, r3, lsr #3 ;each pass of the loop below writes 8 border rows
+
+extra_top_bottom_32
+ subs r9, r9, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ bne extra_top_bottom_32
+
+ sub r5, r1, r8
+ add r6, r2, lr
+ subs r7, r7, #1
+ bne extra_top_bottom_y
+
+ b end_of_border_copy_y
+
+;extra copy part for UV
+extra_top_bottom_uv
+ vld1.8 {d0}, [r1]!
+ vld1.8 {d8}, [r2]!
+
+ mov r9, r3, lsr #3 ;each pass of the loop below writes 8 border rows
+
+extra_top_bottom_16
+ subs r9, r9, #1
+
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ bne extra_top_bottom_16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+ subs r7, r7, #1
+ bne extra_top_bottom_uv
+
+ b end_of_border_copy_uv
+
+
+;=======================
+b16_extend_frame_borders
+;border = 16
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+ sub r5, r1, r3 ;destptr1
+ add r6, r1, lr
+ sub r6, r6, r3, lsl #1 ;destptr2
+ sub r2, r6, #1 ;srcptr2
+
+ ;Do four rows at one time
+ mov r12, r4, lsr #2
+
+copy_left_right_y_b16
+ vld1.8 {d0[], d1[]}, [r1], lr
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vld1.8 {d8[], d9[]}, [r1], lr
+ vld1.8 {d12[], d13[]}, [r2], lr
+ vld1.8 {d16[], d17[]}, [r1], lr
+ vld1.8 {d20[], d21[]}, [r2], lr
+ vld1.8 {d24[], d25[]}, [r1], lr
+ vld1.8 {d28[], d29[]}, [r2], lr
+
+ subs r12, r12, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q4}, [r5], lr
+ vst1.8 {q6}, [r6], lr
+ vst1.8 {q8}, [r5], lr
+ vst1.8 {q10}, [r6], lr
+ vst1.8 {q12}, [r5], lr
+ vst1.8 {q14}, [r6], lr
+
+ bne copy_left_right_y_b16
+
+;Now copy the top and bottom source lines into each line of the respective borders
+ ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
+ mul r8, r3, lr ;r8 = border * y_stride
+
+ sub r6, r1, r3 ;destptr2
+ sub r2, r6, lr ;srcptr2
+ sub r1, r7, r3 ;srcptr1
+ sub r5, r1, r8 ;destptr1
+
+ movs r12, lr, lsr #7 ;number of 128-byte chunks per border row
+ beq top_bottom_y_lt_128_b16 ;y_stride < 128: only the 16-byte remainder path applies
+
+copy_top_bottom_y_b16
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q2, q3}, [r1]!
+ vld1.8 {q10, q11}, [r2]!
+ vld1.8 {q4, q5}, [r1]!
+ vld1.8 {q12, q13}, [r2]!
+ vld1.8 {q6, q7}, [r1]!
+ vld1.8 {q14, q15}, [r2]!
+
+ mov r7, r3
+
+top_bottom_16_b16
+ subs r7, r7, #1
+
+ vst1.8 {q0, q1}, [r5]!
+ vst1.8 {q8, q9}, [r6]!
+ vst1.8 {q2, q3}, [r5]!
+ vst1.8 {q10, q11}, [r6]!
+ vst1.8 {q4, q5}, [r5]!
+ vst1.8 {q12, q13}, [r6]!
+ vst1.8 {q6, q7}, [r5]!
+ vst1.8 {q14, q15}, [r6]!
+
+ add r5, r5, lr
+ sub r5, r5, #128
+ add r6, r6, lr
+ sub r6, r6, #128
+
+ bne top_bottom_16_b16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+
+ subs r12, r12, #1
+ bne copy_top_bottom_y_b16
+
+top_bottom_y_lt_128_b16
+ mov r7, lr, lsr #4 ;check to see if extra copy is needed
+ ands r7, r7, #0x7 ;number of 16-byte chunks left over after the 128-byte loop
+ bne extra_top_bottom_y_b16
+end_of_border_copy_y_b16
+
+;Border copy for U, V planes
+ ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
+ mov lr, lr, lsr #1 ;uv_stride
+ mov r3, r3, lsr #1 ;border
+ mov r4, r4, lsr #1 ;uv_height
+ mov r8, r8, lsr #2 ;uv border * uv_stride
+
+ mov r10, #2 ;two chroma planes: U first, then V
+
+;copy the left and right most columns out
+border_copy_uv_b16
+ sub r5, r1, r3 ;destptr1
+ add r6, r1, lr
+ sub r6, r6, r3, lsl #1 ;destptr2
+ sub r2, r6, #1 ;srcptr2
+
+ mov r7, r1
+
+ ;Do eight rows at one time
+ mov r12, r4, lsr #3
+
+copy_left_right_uv_b16
+ vld1.8 {d0[]}, [r1], lr
+ vld1.8 {d2[]}, [r2], lr
+ vld1.8 {d4[]}, [r1], lr
+ vld1.8 {d6[]}, [r2], lr
+ vld1.8 {d8[]}, [r1], lr
+ vld1.8 {d10[]}, [r2], lr
+ vld1.8 {d12[]}, [r1], lr
+ vld1.8 {d14[]}, [r2], lr
+ vld1.8 {d16[]}, [r1], lr
+ vld1.8 {d18[]}, [r2], lr
+ vld1.8 {d20[]}, [r1], lr
+ vld1.8 {d22[]}, [r2], lr
+ vld1.8 {d24[]}, [r1], lr
+ vld1.8 {d26[]}, [r2], lr
+ vld1.8 {d28[]}, [r1], lr
+ vld1.8 {d30[]}, [r2], lr
+
+ subs r12, r12, #1
+
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d2}, [r6], lr
+ vst1.8 {d4}, [r5], lr
+ vst1.8 {d6}, [r6], lr
+ vst1.8 {d8}, [r5], lr
+ vst1.8 {d10}, [r6], lr
+ vst1.8 {d12}, [r5], lr
+ vst1.8 {d14}, [r6], lr
+ vst1.8 {d16}, [r5], lr
+ vst1.8 {d18}, [r6], lr
+ vst1.8 {d20}, [r5], lr
+ vst1.8 {d22}, [r6], lr
+ vst1.8 {d24}, [r5], lr
+ vst1.8 {d26}, [r6], lr
+ vst1.8 {d28}, [r5], lr
+ vst1.8 {d30}, [r6], lr
+
+ bne copy_left_right_uv_b16
+
+;Now copy the top and bottom source lines into each line of the respective borders
+ sub r6, r1, r3 ;destptr2
+ sub r2, r6, lr ;srcptr2
+ sub r1, r7, r3 ;srcptr1
+ sub r5, r1, r8 ;destptr1
+
+ movs r12, lr, lsr #6 ;number of 64-byte chunks per border row
+ beq top_bottom_uv_lt_64_b16 ;uv_stride < 64: only the 8-byte remainder path applies
+
+copy_top_bottom_uv_b16
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q2, q3}, [r1]!
+ vld1.8 {q10, q11}, [r2]!
+
+ mov r7, r3
+
+top_bottom_8_b16
+ subs r7, r7, #1
+
+ vst1.8 {q0, q1}, [r5]!
+ vst1.8 {q8, q9}, [r6]!
+ vst1.8 {q2, q3}, [r5]!
+ vst1.8 {q10, q11}, [r6]!
+
+ add r5, r5, lr
+ sub r5, r5, #64
+ add r6, r6, lr
+ sub r6, r6, #64
+
+ bne top_bottom_8_b16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+
+ subs r12, r12, #1
+ bne copy_top_bottom_uv_b16
+
+top_bottom_uv_lt_64_b16
+ mov r7, lr, lsr #3 ;check to see if extra copy is needed
+ ands r7, r7, #0x7 ;number of 8-byte chunks left over after the 64-byte loop
+ bne extra_top_bottom_uv_b16
+
+end_of_border_copy_uv_b16
+ subs r10, r10, #1
+ ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
+ bne border_copy_uv_b16
+
+ vpop {d8-d15}
+ pop {r4 - r10, pc}
+
+;;;;;;;;;;;;;;;;;;;;;;
+;extra copy part for Y
+extra_top_bottom_y_b16
+ vld1.8 {q0}, [r1]!
+ vld1.8 {q2}, [r2]!
+
+ mov r9, r3, lsr #3 ;each pass of the loop below writes 8 border rows
+
+extra_top_bottom_16_b16
+ subs r9, r9, #1
+
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ vst1.8 {q0}, [r5], lr
+ vst1.8 {q2}, [r6], lr
+ bne extra_top_bottom_16_b16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+ subs r7, r7, #1
+ bne extra_top_bottom_y_b16
+
+ b end_of_border_copy_y_b16
+
+;extra copy part for UV
+extra_top_bottom_uv_b16
+ vld1.8 {d0}, [r1]!
+ vld1.8 {d8}, [r2]!
+
+ mov r9, r3, lsr #3 ;each pass of the loop below writes 8 border rows
+
+extra_top_bottom_8_b16
+ subs r9, r9, #1
+
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ vst1.8 {d0}, [r5], lr
+ vst1.8 {d8}, [r6], lr
+ bne extra_top_bottom_8_b16
+
+ sub r5, r1, r8
+ add r6, r2, lr
+ subs r7, r7, #1
+ bne extra_top_bottom_uv_b16
+
+ b end_of_border_copy_uv_b16
+
+ ENDP
+ END
diff --git a/vpx_scale/arm/scalesystemdependant.c b/vpx_scale/arm/scalesystemdependant.c
new file mode 100644
index 000000000..3c355becc
--- /dev/null
+++ b/vpx_scale/arm/scalesystemdependant.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+#ifdef HAVE_CONFIG_H
+#include "vpx_config.h"
+#endif
+
+void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf);
+
+void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Sets the vp8_yv12_*_ptr function pointers: to the NEON
+ * routines when HAVE_ARMV7 is non-zero, otherwise to the
+ * non-NEON routines of the same name.
+ *
+ * SPECIAL NOTES : The ARMv4 scaler assignments in the body are commented out.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config(void)
+{
+ /*
+ vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4;
+ vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4;
+ vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+ vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4;
+ vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4;
+ vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+ vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4;
+ vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4;
+ vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+ vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4;
+ vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4;
+ vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+ vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4;
+ vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4;
+ vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+
+ vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c;
+ vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c;
+ vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c;
+ vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c;
+ vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c;
+ vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c;
+ vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
+ */
+
+#if HAVE_ARMV7
+ /* NEON implementations */
+ vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders_neon;
+ vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly_neon;
+ vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame_neon;
+#else
+ /* fallback implementations */
+ vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders;
+ vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly;
+ vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame;
+#endif
+
+}
diff --git a/vpx_scale/arm/yv12extend_arm.c b/vpx_scale/arm/yv12extend_arm.c
new file mode 100644
index 000000000..7c3f7cd07
--- /dev/null
+++ b/vpx_scale/arm/yv12extend_arm.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+
+void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+/*
+ * Copies src_ybc into dst_ybc by delegating to vp8_yv12_copy_frame_func_neon
+ * (declared above, implemented outside this file) and then refreshes the
+ * destination's border pixels through vp8_yv12_extend_frame_borders_ptr.
+ */
+void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
+                              YV12_BUFFER_CONFIG *dst_ybc)
+{
+    /* Step 1: frame copy. */
+    vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
+
+    /* Step 2: rebuild the borders of the freshly written destination. */
+    vp8_yv12_extend_frame_borders_ptr(dst_ybc);
+}
diff --git a/vpx_scale/blackfin/yv12config.c b/vpx_scale/blackfin/yv12config.c
new file mode 100644
index 000000000..7cb083fb9
--- /dev/null
+++ b/vpx_scale/blackfin/yv12config.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : yv12config.c
+ *
+ * Description :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include <cdef_bf533.h>
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+void
+extend_memset(void *dst, unsigned char value, unsigned int size);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_de_alloc_frame_buffer
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame buffer descriptor.
+ *
+ *  RETURNS       : 0 on success, -1 when ybf is a null pointer.
+ *
+ *  FUNCTION      : Releases the memory referenced by ybf->buffer_alloc (if
+ *                  any) and clears that pointer.
+ *
+ ****************************************************************************/
+int
+vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    /* Nothing to release without a descriptor. */
+    if (!ybf)
+        return -1;
+
+    if (ybf->buffer_alloc)
+        duck_free(ybf->buffer_alloc);
+
+    /* Always leave the descriptor without a backing allocation. */
+    ybf->buffer_alloc = 0;
+
+    return 0;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_alloc_frame_buffer
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame buffer descriptor to fill.
+ *                  int width               : Luma width in pixels.
+ *                  int height              : Luma height in pixels.
+ *                  int border              : Luma border on each side; the
+ *                                            chroma planes get border / 2.
+ *
+ *  RETURNS       : 0 on success, -1 if the allocation fails, -2 when ybf
+ *                  is a null pointer.
+ *
+ *  FUNCTION      : Frees any previous allocation, records the plane
+ *                  geometry in ybf, allocates one block holding the Y, U
+ *                  and V planes and points y/u/v_buffer at the first
+ *                  visible pixel of each plane.
+ *
+ *  SPECIAL NOTES : The geometry fields are written before the allocation
+ *                  is attempted, so they are already updated when -1 is
+ *                  returned.
+ *
+ ****************************************************************************/
+int
+vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border)
+{
+    /* Plane sizes including the borders on every side. */
+    const int yplane_size = (height + 2 * border) * (width + 2 * border);
+    const int uvplane_size = (height / 2 + border) * (width / 2 + border);
+
+    if (!ybf)
+        return -2;
+
+    vp8_yv12_de_alloc_frame_buffer(ybf);
+
+    /* Luma geometry. */
+    ybf->y_width = width;
+    ybf->y_height = height;
+    ybf->y_stride = width + 2 * border;
+
+    /* Chroma geometry: half resolution in both directions. */
+    ybf->uv_width = width / 2;
+    ybf->uv_height = height / 2;
+    ybf->uv_stride = ybf->uv_width + border;
+
+    ybf->border = border;
+
+    // Extra room (one luma stride here) is requested past the three planes so
+    // that copy12x12 doesn't fail when we have a large motion vector in V on
+    // the last v block.
+    // Note : We never use these pixels anyway so this doesn't hurt.
+    ybf->buffer_alloc = (unsigned char *) duck_memalign(32, (yplane_size * 3 / 2) + ybf->y_stride , 0);
+
+    if (ybf->buffer_alloc == NULL)
+        return -1;
+
+    /* Skip the top border rows and the left border columns of each plane. */
+    ybf->y_buffer = ybf->buffer_alloc + border * ybf->y_stride + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size + border / 2 * ybf->uv_stride + border / 2;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + border / 2 * ybf->uv_stride + border / 2;
+
+    return 0;
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_black_frame_buffer
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame buffer to blank.
+ *
+ *  RETURNS       : 0 on success (also when no buffer is allocated),
+ *                  -1 when ybf is a null pointer.
+ *
+ *  FUNCTION      : Fills the whole Y plane (borders included) with 0x00
+ *                  and the whole U and V planes with 0x80.
+ *
+ *  SPECIAL NOTES : y_buffer/u_buffer/v_buffer point at the first visible
+ *                  pixel, i.e. past the top border rows and left border
+ *                  columns (see vp8_yv12_alloc_frame_buffer in this file).
+ *                  The fill sizes cover complete planes, so each fill has
+ *                  to start at the plane base; starting at the interior
+ *                  pointer would run past the end of the plane and, for
+ *                  the V plane, past the end of the allocation.
+ *
+ ****************************************************************************/
+int
+vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    if (ybf)
+    {
+        if (ybf->buffer_alloc)
+        {
+            const int border = ybf->border;
+            const int uv_border = border / 2;
+
+            /* Undo the offsets applied when the plane pointers were set up. */
+            unsigned char *y_plane = ybf->y_buffer - (border * ybf->y_stride) - border;
+            unsigned char *u_plane = ybf->u_buffer - (uv_border * ybf->uv_stride) - uv_border;
+            unsigned char *v_plane = ybf->v_buffer - (uv_border * ybf->uv_stride) - uv_border;
+
+            extend_memset(y_plane, 0x0, ybf->y_stride *(ybf->y_height + 2 * border));
+            extend_memset(u_plane, 0x80, ybf->uv_stride *(ybf->uv_height + border));
+            extend_memset(v_plane, 0x80, ybf->uv_stride *(ybf->uv_height + border));
+        }
+
+        return 0;
+    }
+
+    return -1;
+}
diff --git a/vpx_scale/blackfin/yv12extend.c b/vpx_scale/blackfin/yv12extend.c
new file mode 100644
index 000000000..d5be4950d
--- /dev/null
+++ b/vpx_scale/blackfin/yv12extend.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : yv12extend.c
+ *
+ * Description :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include <cdef_bf533.h>
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*
+****************************************************************************/
+
+
+/****************************************************************************
+*
+* ROUTINE : extend_memset
+*
+* INPUTS : void *dst : Destination address handed to the DMA.
+* unsigned char value : Byte value to replicate.
+* unsigned int size : Number of bytes to fill.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Fills memory with a byte value by programming the
+* p_mdma_s1_* / p_mdma_d1_* registers (declared outside this
+* file, presumably by cdef_bf533.h) for 16-bit transfers and
+* busy-waiting until DMA_DONE is reported.
+*
+* SPECIAL NOTES : size / 2 transfers are programmed, so an odd trailing
+* byte is not written. When size / 2 is 64K or more only a
+* message is printed; the transfer is still started.
+*
+****************************************************************************/
+void
+extend_memset(void *dst, unsigned char value, unsigned int size)
+{
+#if 0
+ unsigned int quad_value;
+
+ quad_value = (unsigned int) value;
+ quad_value |= (unsigned int) value << 8;
+ quad_value |= (unsigned int) value << 16;
+ quad_value |= (unsigned int) value << 24;
+#else
+ unsigned short quad_value; // 16-bit fill pattern: value in both bytes
+
+ quad_value = (unsigned int) value;
+ quad_value |= (unsigned int) value << 8;
+#endif
+
+
+ if (size / 2 >= 64 * 1024) // diagnostic only, execution continues
+ printf("_Extend_memset__________ dma memset is broken\n");
+
+ *p_mdma_s1_start_addr = &quad_value; // source is the local pattern word
+ *p_mdma_s1_x_count = size / 2;
+ *p_mdma_s1_x_modify = 0x0; // zero step: the same pattern word is read every time
+ *p_mdma_d1_start_addr = dst;
+ *p_mdma_d1_x_count = size / 2;
+ *p_mdma_d1_x_modify = 2; // destination advances 2 bytes per transfer
+
+ *p_mdma_s1_config = DMAEN | WDSIZE_16; // source side is configured first
+ asm("ssync;");
+
+ *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16; // then the destination side
+ asm("ssync;");
+
+ while ((*p_mdma_d1_irq_status & DMA_DONE) == 0); // spin until the destination reports done
+
+ *p_mdma_d1_irq_status |= DMA_DONE; // presumably acknowledges the done flag - confirm against the hardware reference
+}
+
+/****************************************************************************
+*
+* ROUTINE : extend_memcpy
+*
+* INPUTS : void *dst : Destination address handed to the DMA.
+* void *src : Source address handed to the DMA.
+* unsigned int size : Number of bytes to copy.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Copies memory by programming the p_mdma_s1_* /
+* p_mdma_d1_* registers (declared outside this file,
+* presumably by cdef_bf533.h) for 16-bit transfers and
+* busy-waiting until DMA_DONE is reported.
+*
+* SPECIAL NOTES : size / 2 transfers are programmed, so an odd trailing
+* byte is not copied. The two checks at the top only print
+* a message; the transfer is started regardless.
+*
+****************************************************************************/
+void
+extend_memcpy(void *dst, void *src, unsigned int size)
+{
+ if (size / 2 >= 64 * 1024) // diagnostic only, execution continues
+ printf("_Extend_memcpy__________ dma memcpy is broken\n");
+
+
+ if ((size & 0x3)) // diagnostic only, execution continues
+ printf("_)__________ size not a multiple of 4\n");
+
+// 32-bit DMA here caused some data to be corrupted (cause unknown), so
+// 16-bit transfers are used below.
+
+ *p_mdma_s1_start_addr = src;
+ *p_mdma_s1_x_count = size / 2;
+ *p_mdma_s1_x_modify = 2; // source advances 2 bytes per transfer
+ *p_mdma_d1_start_addr = dst;
+ *p_mdma_d1_x_count = size / 2;
+ *p_mdma_d1_x_modify = 2; // destination advances 2 bytes per transfer
+
+ *p_mdma_s1_config = DMAEN | WDSIZE_16; // source side is configured first
+ asm("ssync;");
+
+ *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16; // then the destination side
+ asm("ssync;");
+
+ while ((*p_mdma_d1_irq_status & DMA_DONE) == 0); // spin until the destination reports done
+
+ *p_mdma_d1_irq_status |= DMA_DONE; // presumably acknowledges the done flag - confirm against the hardware reference
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : extend_plane_borders
+ *
+ *  INPUTS        : unsigned char *plane : First visible pixel of the plane.
+ *                  int plane_width      : Visible width in pixels.
+ *                  int plane_height     : Visible height in rows.
+ *                  int plane_stride     : Distance between rows in bytes.
+ *                  unsigned int Border  : Border size on each side.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Replicates the edge pixels of one plane into its border:
+ *                  first the left/right columns of every visible row, then
+ *                  the (already widened) top and bottom rows.
+ *
+ ****************************************************************************/
+static void
+extend_plane_borders(unsigned char *plane, int plane_width, int plane_height,
+                     int plane_stride, unsigned int Border)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+
+    // copy the left and right most columns out
+    src_ptr1 = plane;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        extend_memset(dest_ptr1, src_ptr1[0], Border);
+        extend_memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1 += plane_stride;
+        src_ptr2 += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders.
+    // The source rows start at the left border so the corners are filled as well.
+    src_ptr1 = plane - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        extend_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        dest_ptr1 += plane_stride;
+    }
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        extend_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr2 += plane_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_extend_frame_borders
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame whose borders are rebuilt.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Extends the Y, U and V planes of ybf into their borders.
+ *
+ *  SPECIAL NOTES : The chroma geometry is derived by halving the luma
+ *                  stride, height, width and border rather than by reading
+ *                  the uv_* fields of ybf.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int Border = ybf->border;
+    int plane_stride = ybf->y_stride;
+    int plane_height = ybf->y_height;
+    int plane_width = ybf->y_width;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    extend_plane_borders(ybf->y_buffer, plane_width, plane_height, plane_stride, Border);
+
+    plane_stride /= 2;
+    plane_height /= 2;
+    plane_width /= 2;
+    Border /= 2;
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_plane_borders(ybf->u_buffer, plane_width, plane_height, plane_stride, Border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_plane_borders(ybf->v_buffer, plane_width, plane_height, plane_stride, Border);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : copy_plane_rows
+ *
+ *  INPUTS        : unsigned char *dest   : First visible pixel of the
+ *                                          destination plane.
+ *                  unsigned char *source : First visible pixel of the
+ *                                          source plane.
+ *                  int width             : Bytes copied per row.
+ *                  int height            : Number of rows.
+ *                  int source_stride     : Source row pitch in bytes.
+ *                  int dest_stride       : Destination row pitch in bytes.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the visible area of one plane row by row with
+ *                  extend_memcpy.
+ *
+ ****************************************************************************/
+static void
+copy_plane_rows(unsigned char *dest, unsigned char *source,
+                int width, int height, int source_stride, int dest_stride)
+{
+    int row;
+
+    for (row = 0; row < height; row++)
+    {
+        extend_memcpy(dest, source, width);
+        source += source_stride;
+        dest += dest_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Source frame.
+ *                  YV12_BUFFER_CONFIG *dst_ybc : Destination frame.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the source image into the destination image and
+ *                  updates the destination's UMV borders.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size; widths
+ *                  and heights are taken from the source, strides from
+ *                  each frame individually.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    copy_plane_rows(dst_ybc->y_buffer, src_ybc->y_buffer,
+                    src_ybc->y_width, src_ybc->y_height,
+                    src_ybc->y_stride, dst_ybc->y_stride);
+
+    copy_plane_rows(dst_ybc->u_buffer, src_ybc->u_buffer,
+                    src_ybc->uv_width, src_ybc->uv_height,
+                    src_ybc->uv_stride, dst_ybc->uv_stride);
+
+    copy_plane_rows(dst_ybc->v_buffer, src_ybc->v_buffer,
+                    src_ybc->uv_width, src_ybc->uv_height,
+                    src_ybc->uv_stride, dst_ybc->uv_stride);
+
+    /* Only the visible area was copied; rebuild the borders from it. */
+    vp8_yv12_extend_frame_borders(dst_ybc);
+}
diff --git a/vpx_scale/dm642/bicubic_scaler_c64.c b/vpx_scale/dm642/bicubic_scaler_c64.c
new file mode 100644
index 000000000..9bd379725
--- /dev/null
+++ b/vpx_scale/dm642/bicubic_scaler_c64.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vpxscale_arbitrary.h"
+
+extern BICUBIC_SCALER_STRUCT g_b_scaler;
+/*
+ * bicubic_scale_c64
+ *
+ * Scales one image plane from in_width x in_height to out_width x out_height
+ * using the tables held in the global g_b_scaler: l_w / l_h (source column /
+ * row index per output position), c_w / c_h (coefficients, 4 per phase),
+ * nw / nh (number of phases) and hbuf (scratch row). The tables are prepared
+ * outside this file.
+ *
+ * For every output row the selected source row is first filtered vertically
+ * into hbuf, 4 pixels per iteration, unless the vertical phase is 0 or the
+ * row is one of the last two input rows, in which case the source row is
+ * used directly. The row is then filtered horizontally, one output pixel at
+ * a time. Both filters shift their sums right by 12.
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): the vertical pass reads ip[w - in_stride] (the row above the
+ * selected one) and the horizontal pass reads hbuf[lw - 1]; confirm that the
+ * tables never make these fall before the start of the buffers, or that a
+ * readable border exists.
+ * NOTE(review): the vertical result goes through _spacku4 while the
+ * horizontal sum is stored directly into an unsigned char; presumably the
+ * former saturates and the latter truncates - confirm against the TI
+ * intrinsic documentation.
+ */
+int bicubic_scale_c64(int in_width, int in_height, int in_stride,
+ int out_width, int out_height, int out_stride,
+ unsigned char *input_image, unsigned char *output_image)
+{
+ short *restrict l_w, * restrict l_h;
+ short *restrict c_w, * restrict c_h;
+ unsigned char *restrict ip, * restrict op, *restrict op_w;
+ unsigned char *restrict hbuf;
+ int h, w, lw, lh;
+ int phase_offset_w, phase_offset_h;
+ double coeff; // carries the raw 64-bit load of four 16-bit coefficients
+ int max_phase;
+
+ c_w = g_b_scaler.c_w;
+ c_h = g_b_scaler.c_h;
+
+ op = output_image;
+
+ l_w = g_b_scaler.l_w;
+ l_h = g_b_scaler.l_h;
+
+ phase_offset_h = 0;
+
+ for (h = 0; h < out_height; h++)
+ {
+ // select the row to work on
+ lh = l_h[h];
+ ip = input_image + (in_stride * lh);
+
+ coeff = _memd8_const(&c_h[phase_offset_h*4]); // 4 vertical coefficients of this phase
+
+ // filter the row vertically into a temporary buffer.
+ // If the phase offset == 0 then all the multiplication
+ // is going to result in the output equalling the input.
+ // So instead point the temporary buffer to the input.
+ // Also handle the boundary condition of not being able to
+ // filter the last lines.
+ if (phase_offset_h && (lh < in_height - 2))
+ {
+ hbuf = g_b_scaler.hbuf;
+
+ for (w = 0; w < in_width; w += 4)
+ {
+ int ip1, ip2, ip3, ip4;
+ int y13_12, y11_10, y23_22, y21_20, y33_32, y31_30, y43_42, y41_40;
+ int y10_20, y11_21, y12_22, y13_23, y30_40, y31_41, y32_42, y33_43;
+ int s1, s2, s3, s4;
+
+ ip1 = _mem4_const(&ip[w - in_stride]); // row above the selected row
+ ip2 = _mem4_const(&ip[w]); // selected row
+ ip3 = _mem4_const(&ip[w + in_stride]); // one row below
+ ip4 = _mem4_const(&ip[w + 2*in_stride]); // two rows below
+
+ // realignment of data. Unpack the data so that it is in short
+ // format instead of bytes.
+ y13_12 = _unpkhu4(ip1);
+ y11_10 = _unpklu4(ip1);
+ y23_22 = _unpkhu4(ip2);
+ y21_20 = _unpklu4(ip2);
+ y33_32 = _unpkhu4(ip3);
+ y31_30 = _unpklu4(ip3);
+ y43_42 = _unpkhu4(ip4);
+ y41_40 = _unpklu4(ip4);
+
+ // repack the data so that elements 1 and 2 are together. this
+ // lines up so that a dot product with the coefficients can be
+ // done.
+ y10_20 = _pack2(y11_10, y21_20);
+ y11_21 = _packh2(y11_10, y21_20);
+ y12_22 = _pack2(y13_12, y23_22);
+ y13_23 = _packh2(y13_12, y23_22);
+
+ s1 = _dotp2(_hi(coeff), y10_20); // rows 1 and 2 against the high coefficient pair
+ s2 = _dotp2(_hi(coeff), y11_21);
+ s3 = _dotp2(_hi(coeff), y12_22);
+ s4 = _dotp2(_hi(coeff), y13_23);
+
+ y30_40 = _pack2(y31_30, y41_40);
+ y31_41 = _packh2(y31_30, y41_40);
+ y32_42 = _pack2(y33_32, y43_42);
+ y33_43 = _packh2(y33_32, y43_42);
+
+ // elements 3 and 4 are now packed together; add their
+ // contribution using the low coefficient pair.
+ s1 += _dotp2(_lo(coeff), y30_40);
+ s2 += _dotp2(_lo(coeff), y31_41);
+ s3 += _dotp2(_lo(coeff), y32_42);
+ s4 += _dotp2(_lo(coeff), y33_43);
+
+ s1 = s1 >> 12;
+ s2 = s2 >> 12;
+ s3 = s3 >> 12;
+ s4 = s4 >> 12;
+
+ s1 = _pack2(s2, s1);
+ s2 = _pack2(s4, s3);
+
+ _amem4(&hbuf[w]) = _spacku4(s2, s1); // store the 4 filtered pixels as bytes
+ }
+ }
+ else
+ hbuf = ip; // no vertical filtering: use the source row as is
+
+ // increase the phase offset for the next time around.
+ if (++phase_offset_h >= g_b_scaler.nh)
+ phase_offset_h = 0;
+
+ op_w = op;
+
+ // will never be able to interpolate first pixel, so just copy it
+ // over here.
+ phase_offset_w = 1;
+ *op_w++ = hbuf[0];
+
+ if (1 >= g_b_scaler.nw) phase_offset_w = 0; // with a single phase, wrap back to phase 0
+
+ max_phase = g_b_scaler.nw;
+
+ for (w = 1; w < out_width; w++)
+ {
+ double coefficients;
+ int hbuf_high, hbuf_low, hbuf_both;
+ int sum_high, sum_low, sum;
+
+ // get the index to use to expand the image
+ lw = l_w[w];
+ coefficients = _amemd8_const(&c_w[phase_offset_w*4]); // 4 horizontal coefficients of this phase
+ hbuf_both = _mem4_const(&hbuf[lw-1]); // 4 pixels starting one to the left of lw
+
+ hbuf_high = _unpkhu4(hbuf_both);
+ hbuf_low = _unpklu4(hbuf_both);
+
+ sum_high = _dotp2(_hi(coefficients), hbuf_high);
+ sum_low = _dotp2(_lo(coefficients), hbuf_low);
+
+ sum = (sum_high + sum_low) >> 12;
+
+ if (++phase_offset_w >= max_phase)
+ phase_offset_w = 0;
+
+ if ((lw + 2) >= in_width) // too close to the right edge: copy the pixel instead
+ sum = hbuf[lw];
+
+ *op_w++ = sum;
+ }
+
+ op += out_stride;
+ }
+
+ return 0;
+}
+
+/*
+ * bicubic_scale_frame_c64
+ *
+ * Rewrites the geometry of dst for a new_width x new_height luma plane
+ * (chroma at half size, strides set equal to the widths) and then scales
+ * the Y, U and V planes of src into dst with bicubic_scale_c64.
+ */
+void bicubic_scale_frame_c64(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                             int new_width, int new_height)
+{
+    /* Destination geometry: luma first, chroma at half resolution. */
+    dst->y_width = new_width;
+    dst->y_height = new_height;
+    dst->uv_width = new_width / 2;
+    dst->uv_height = new_height / 2;
+
+    /* Rows are packed back to back in the destination. */
+    dst->y_stride = dst->y_width;
+    dst->uv_stride = dst->uv_width;
+
+    /* Y plane */
+    bicubic_scale_c64(src->y_width, src->y_height, src->y_stride,
+                      new_width, new_height, dst->y_stride,
+                      src->y_buffer, dst->y_buffer);
+
+    /* U plane */
+    bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
+                      new_width / 2, new_height / 2, dst->uv_stride,
+                      src->u_buffer, dst->u_buffer);
+
+    /* V plane */
+    bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
+                      new_width / 2, new_height / 2, dst->uv_stride,
+                      src->v_buffer, dst->v_buffer);
+}
diff --git a/vpx_scale/dm642/gen_scalers_c64.c b/vpx_scale/dm642/gen_scalers_c64.c
new file mode 100644
index 000000000..2126a7534
--- /dev/null
+++ b/vpx_scale/dm642/gen_scalers_c64.c
@@ -0,0 +1,607 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : gen_scalers.c
+ *
+ * Description : Generic image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_4_5_scale_c64
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : Pixels are handled in groups of 4 loaded as one 32-bit
+ *                  word; the final group is always processed by the code
+ *                  after the loop, so at least one full group is read.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_c64
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned i;
+    unsigned int ba, cb, dc, ed;
+    unsigned char *restrict des = dest;
+    unsigned int *restrict src = (unsigned int *)source;
+    unsigned int const_51_205, const_102_154,
+             const_205_51, const_154_102;
+
+    unsigned int src_current, src_next;
+
+    (void) dest_width;
+
+    // Constants that are to be used for the filtering.  For
+    // best speed we are going to want to right shift by 16.
+    // In the generic version they were shift by 8, so put
+    // an extra 8 in now so that 16 will come out later.
+    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    // 5 points are needed to filter to give 5 output points.
+    // A load can pull up 4 at a time, and one needs to be
+    // "borrowed" from the next set of data.  So instead of
+    // loading those 5 points each time, "steal" a point from
+    // the next set and only load up 4 each time through.
+    src_current = _mem4(src);
+
+    // "i + 4 < source_width" instead of "i < source_width - 4" so that a
+    // width below 4 cannot wrap around in unsigned arithmetic.
+    for (i = 0; i + 4 < source_width; i += 4)
+    {
+        // Advance BEFORE loading: src_current already holds the word at
+        // src, so a post-increment here would fetch the same word again,
+        // repeat the first group and never reach the last one.
+        src_next = _mem4(++src);
+
+        // Reorder the data so that it is ready for the
+        // dot product.
+        ba = _unpklu4(src_current);
+        cb = _unpkhu4(_rotl(src_current, 8));
+        dc = _unpkhu4(src_current);
+        ed = _unpkhu4(_shrmb(src_next, src_current));
+
+        // Use the dot product with round and shift.
+        des [0] = src_current & 0xff;
+        des [1] = _dotprsu2(ba, const_205_51);
+        des [2] = _dotprsu2(cb, const_154_102);
+        des [3] = _dotprsu2(dc, const_102_154);
+        des [4] = _dotprsu2(ed, const_51_205);
+
+        des += 5;
+
+        // reuse loaded values next time around.
+        src_current = src_next;
+    }
+
+    // filter the last set of points. Normally a point from the next set
+    // would be used, but there is no next set, so just fill.
+    ba = _unpklu4(src_current);
+    cb = _unpkhu4(_rotl(src_current, 8));
+    dc = _unpkhu4(src_current);
+
+    des [0] = src_current & 0xff;
+    des [1] = _dotprsu2(ba, const_205_51);
+    des [2] = _dotprsu2(cb, const_154_102);
+    des [3] = _dotprsu2(dc, const_102_154);
+    // Fill with the LAST pixel of the group (top byte of the word, matching
+    // the byte order used for des[0] above), not the first one.
+    des [4] = (src_current >> 24) & 0xff;
+
+}
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_4_5_scale_c64
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+ * height of the band scaled is 4-pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band (read at dest_pitch*5).
+ * The band is filtered in place: rows 1 to 4 of each column
+ * are overwritten, row 0 is left untouched.
+ * The loads for column i+1 are issued before the stores of
+ * column i, so the last iteration also reads (but does not
+ * use) one column past dest_width.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ unsigned int i;
+ unsigned int a, b, c, d, e;
+ unsigned int ba, cb, dc, ed;
+ unsigned char *restrict src = dest;
+ unsigned char *restrict des = dest;
+ unsigned int const_51_205, const_102_154,
+ const_205_51, const_154_102;
+
+ const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+ const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+ const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+ const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+ // Force a loop unroll here so that there is not such a
+ // dependency.
+ a = src [0];
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ d = src [dest_pitch*3];
+ e = src [dest_pitch*5]; // first line of the band below
+ src ++;
+
+ for (i = 0; i < dest_width; i++)
+ {
+ ba = _pack2(b, a); // pair up vertically adjacent samples of column i
+ cb = _pack2(c, b);
+ dc = _pack2(d, c);
+ ed = _pack2(e, d);
+
+ a = src [0]; // preload column i+1 before column i is overwritten
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ d = src [dest_pitch*3];
+ e = src [dest_pitch*5];
+ src ++;
+
+ des [dest_pitch] = _dotprsu2(ba, const_205_51); // weights 205 (b) and 51 (a)
+ des [dest_pitch*2] = _dotprsu2(cb, const_154_102); // weights 154 (c) and 102 (b)
+ des [dest_pitch*3] = _dotprsu2(dc, const_102_154); // weights 102 (d) and 154 (c)
+ des [dest_pitch*4] = _dotprsu2(ed, const_51_205); // weights 51 (e) and 205 (d)
+
+ des ++;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_4_5_scale_c64
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 4 to 5. The
+ * height of the band scaled is 4-pixels.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band. Row 4 is therefore a copy of the value loaded
+ * from row 3.
+ * The loads for column i+1 are issued before the stores of
+ * column i, so the last iteration also reads (but does not
+ * use) one column past dest_width.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ unsigned int i;
+ unsigned int a, b, c, d;
+ unsigned int ba, cb, dc;
+ unsigned char *restrict src = dest;
+ unsigned char *restrict des = dest;
+ unsigned int const_102_154, const_205_51, const_154_102;
+
+ const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+ const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+ const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+ a = src [0]; // column 0 is loaded ahead of the loop
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ d = src [dest_pitch*3];
+ src ++;
+
+ for (i = 0; i < dest_width; ++i)
+ {
+ ba = _pack2(b, a); // pair up vertically adjacent samples of column i
+ cb = _pack2(c, b);
+ dc = _pack2(d, c);
+
+ a = src [0]; // preload column i+1 before column i is overwritten
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ d = src [dest_pitch*3];
+ src ++;
+
+ des [dest_pitch] = _dotprsu2(ba, const_205_51); // weights 205 (b) and 51 (a)
+ des [dest_pitch*2] = _dotprsu2(cb, const_154_102); // weights 154 (c) and 102 (b)
+ des [dest_pitch*3] = _dotprsu2(dc, const_102_154); // weights 102 (d) and 154 (c)
+ des [dest_pitch*4] = (unsigned char) d; // NOTE(review): d was just reloaded, so this is column i+1's row 3, not column i's - confirm intent
+
+ des++;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_3_5_scale_c64
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling up by 3 to 5.
+ *
+ * SPECIAL NOTES : Each step loads 4 bytes but advances the source by 3;
+ * the fourth byte is the first pixel of the next group.
+ * The final group also loads 4 bytes although only 3 of
+ * them are used, so one byte past the last group is read.
+ * The loop bound source_width - 3 is unsigned arithmetic;
+ * presumably callers never pass a width below 3 - confirm.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_c64
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ unsigned int i;
+ unsigned int ba, cb, dc;
+ unsigned int src_current;
+ unsigned char *restrict des = dest;
+ unsigned char *restrict src = (unsigned char *)source;
+ unsigned int const_51_205, const_102_154,
+ const_205_51, const_154_102;
+
+ (void) dest_width;
+
+ const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+ const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+ const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+ const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+ for (i = 0; i < source_width - 3; i += 3)
+ {
+ src_current = _mem4(src); // pixels a, b, c plus d from the next group
+
+ // Reorder the data so that it is ready for the
+ // dot product.
+ ba = _unpklu4(src_current);
+ cb = _unpkhu4(_rotl(src_current, 8));
+ dc = _unpkhu4(src_current);
+
+ des [0] = src_current & 0xff; // first pixel is copied unchanged
+ des [1] = _dotprsu2(ba, const_154_102); // weights 154 (b) and 102 (a)
+ des [2] = _dotprsu2(cb, const_51_205); // weights 51 (c) and 205 (b)
+ des [3] = _dotprsu2(cb, const_205_51); // weights 205 (c) and 51 (b)
+ des [4] = _dotprsu2(dc, const_102_154); // weights 102 (d) and 154 (c)
+
+ src += 3;
+ des += 5;
+ }
+
+ src_current = _mem4(src); // last group: there is no following pixel to blend with
+
+ ba = _unpklu4(src_current);
+ cb = _unpkhu4(_rotl(src_current, 8));
+ dc = _unpkhu4(src_current);
+
+
+ des [0] = src_current & 0xff;
+ des [1] = _dotprsu2(ba, const_154_102);
+ des [2] = _dotprsu2(cb, const_51_205);
+ des [3] = _dotprsu2(cb, const_205_51);
+ des [4] = dc & 0xff; // low half of dc, i.e. the third pixel of the group, is repeated
+
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_3_5_scale_c64
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+ * height of the band scaled is 3-pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band (read at dest_pitch*5).
+ * The band is filtered in place: rows 1 to 4 of each column
+ * are overwritten, row 0 is left untouched.
+ * The loads for column i+1 are issued before the stores of
+ * column i, so the last iteration also reads (but does not
+ * use) one column past dest_width.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ unsigned int i;
+ unsigned int a, b, c, d;
+ unsigned int ba, cb, dc;
+ unsigned char *restrict src = dest;
+ unsigned char *restrict des = dest;
+ unsigned int const_51_205, const_102_154,
+ const_205_51, const_154_102;
+
+ const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+ const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+ const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+ const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+ a = src [0]; // column 0 is loaded ahead of the loop
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ d = src [dest_pitch*5]; // first line of the band below
+ src ++;
+
+ for (i = 0; i < dest_width; i++)
+ {
+ ba = _pack2(b, a); // pair up vertically adjacent samples of column i
+ cb = _pack2(c, b);
+ dc = _pack2(d, c);
+
+ a = src [0]; // preload column i+1 before column i is overwritten
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ d = src [dest_pitch*5];
+ src ++;
+
+ des [dest_pitch] = _dotprsu2(ba, const_154_102); // weights 154 (b) and 102 (a)
+ des [dest_pitch*2] = _dotprsu2(cb, const_51_205); // weights 51 (c) and 205 (b)
+ des [dest_pitch*3] = _dotprsu2(cb, const_205_51); // weights 205 (c) and 51 (b)
+ des [dest_pitch*4] = _dotprsu2(dc, const_102_154); // weights 102 (d) and 154 (c)
+
+ des++;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_3_5_scale_c64
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 3 to 5. The
+ * height of the band scaled is 3-pixels.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band. Row 4 is therefore a copy of the value loaded
+ * from row 2.
+ * The loads for column i+1 are issued before the stores of
+ * column i, so the last iteration also reads (but does not
+ * use) one column past dest_width.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ unsigned int i;
+ unsigned int a, b, c;
+ unsigned int ba, cb;
+ unsigned char *restrict src = dest;
+ unsigned char *restrict des = dest;
+ unsigned int const_51_205, const_205_51, const_154_102;
+
+ const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+ const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+ const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+ a = src [0]; // column 0 is loaded ahead of the loop
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ src ++;
+
+ for (i = 0; i < dest_width; ++i)
+ {
+ ba = _pack2(b, a); // pair up vertically adjacent samples of column i
+ cb = _pack2(c, b);
+
+ a = src [0]; // preload column i+1 before column i is overwritten
+ b = src [dest_pitch];
+ c = src [dest_pitch*2];
+ src ++;
+
+ des [dest_pitch] = _dotprsu2(ba, const_154_102); // weights 154 (b) and 102 (a)
+ des [dest_pitch*2] = _dotprsu2(cb, const_51_205); // weights 51 (c) and 205 (b)
+ des [dest_pitch*3] = _dotprsu2(cb, const_205_51); // weights 205 (c) and 51 (b)
+ des [dest_pitch*4] = (unsigned char)(c) ; // NOTE(review): c was just reloaded, so this is column i+1's row 2, not column i's - confirm intent
+
+ des++;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_1_2_scale_c64
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Number of source pixels; the loop runs
+ * source_width / 4 times, 4 pixels per pass.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling up by 1 to 2. Each pass consumes 4
+ * source bytes and stores 8 destination bytes.
+ *
+ * SPECIAL NOTES : source width must be a multiple of 4.
+ * Every pass loads 8 bytes from src although src only
+ * advances by 4, so the final pass reads 4 bytes beyond
+ * source_width.
+ *
+ ****************************************************************************/
+void horizontal_line_1_2_scale_c64
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ unsigned int i;
+ unsigned char *restrict des = dest;
+ unsigned char *restrict src = (unsigned char *)source;
+ unsigned int src7_4i, src4_1i, src3_0i;
+ unsigned int a4_0i, ahi, alo;
+ double src7_0d, src3_0d;
+ const unsigned int k01 = 0x01010101;
+
+ for (i = 0; i < source_width / 4; i += 1)
+ {
+ // Load up the data from src. Here a wide load is
+ // used to get 8 bytes at once, only 5 will be used
+ // for the actual computation.
+ src7_0d = _memd8(src);
+ src3_0i = _lo(src7_0d);
+ src7_4i = _hi(src7_0d);
+
+ // Need to average between neighbouring points. Shift byte 4
+ // into the lower word so that bytes 4-1 line up with bytes 3-0,
+ // then average the two words byte by byte.
+ src4_1i = _shrmb(src7_4i, src3_0i);
+ a4_0i = _avgu4(src4_1i, src3_0i);
+
+ // Expand the data out. Could do an unpack, however
+ // all but the multiply units are already heavily loaded
+ // here, so the multiply unit takes some of the computation
+ // (multiplying every byte by 1 widens it to 16 bits).
+ src3_0d = _mpyu4(src3_0i, k01);
+
+ // The averages need to be unpacked so that they are in 16
+ // bit form and will be able to be interleaved with the
+ // original data
+ ahi = _unpkhu4(a4_0i);
+ alo = _unpklu4(a4_0i);
+
+ ahi = _swap4(ahi);
+ alo = _swap4(alo);
+
+ // Mix the average result in with the original data.
+ ahi = _hi(src3_0d) | ahi;
+ alo = _lo(src3_0d) | alo;
+
+ _memd8(des) = _itod(ahi, alo);
+
+ des += 8;
+ src += 4;
+ }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_1_2_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales a vertical band of pixels by 1 to 2. The band is
+ *                  one pixel high: the row at dest + dest_pitch is filled
+ *                  with the byte-wise average of the rows at dest and at
+ *                  dest + 2 * dest_pitch.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *                  Destination width must be a multiple of 4, because the
+ *                  input must be; only dest_width / 4 whole words are
+ *                  processed.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int words_left;
+    unsigned int *restrict upper = (unsigned int *)dest;
+    unsigned int *restrict lower = (unsigned int *)(dest + (dest_pitch * 2));
+    unsigned int *restrict middle = (unsigned int *)(dest + dest_pitch);
+
+    // Four pixels per pass: average the row above with the row below.
+    for (words_left = dest_width / 4; words_left > 0; words_left--)
+    {
+        unsigned int top = _mem4(upper);
+        unsigned int bottom = _mem4(lower);
+
+        _mem4(middle) = _avgu4(top, bottom);
+
+        upper++;
+        lower++;
+        middle++;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_1_2_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales the last vertical band of pixels by 1 to 2. The
+ *                  band is one pixel high: the row at dest is duplicated
+ *                  into the row at dest + dest_pitch.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band. Width must be a multiple of 4; only
+ *                  dest_width / 4 whole words are copied.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int words_left;
+    unsigned int *restrict from = (unsigned int *)dest;
+    unsigned int *restrict to = (unsigned int *)(dest + dest_pitch);
+
+    // Four pixels per pass, copied with unaligned word accesses.
+    for (words_left = dest_width / 4; words_left > 0; words_left--)
+    {
+        _mem4(to) = _mem4(from);
+        to++;
+        from++;
+    }
+}
+
+/* Points the vp8_* scaler hooks at the C64x implementations in this file. */
+void
+register_generic_scalers(void)
+{
+    /* horizontal line scalers */
+    vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_c64;
+    vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_c64;
+    vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_c64;
+
+    /* vertical band scalers */
+    vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_c64;
+    vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_c64;
+    vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_c64;
+
+    /* vertical band scalers for the last band of a frame */
+    vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_c64;
+    vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_c64;
+    vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_c64;
+}
diff --git a/vpx_scale/dm642/yv12extend.c b/vpx_scale/dm642/yv12extend.c
new file mode 100644
index 000000000..ca25a5fce
--- /dev/null
+++ b/vpx_scale/dm642/yv12extend.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : yv12extend.c
+ *
+ * Description :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+//#include <stdlib.h>
+#include "csl_dat.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+#define UINT8 unsigned char
+#define UINT32 unsigned int
+
+
+/*
+ * Replicates the first and last pixel of every row into the left and right
+ * borders of a plane.
+ *
+ *   src_ptr1     : first pixel of the first row (value copied to the left).
+ *   src_ptr2     : last pixel of the first row (value copied to the right).
+ *   dest_ptr1    : start of the left border of the first row.
+ *   dest_ptr2    : start of the right border of the first row.
+ *   plane_height : number of rows to process.
+ *   plane_stride : distance in bytes between consecutive rows.
+ *
+ * Exactly 48 bytes are written on each side of every row with aligned
+ * 8-byte stores, so the border must be 48 bytes wide and both destination
+ * pointers must be 8-byte aligned.
+ */
+static inline
+void copy_yleft_right_border(
+    UINT8 *restrict src_ptr1,
+    UINT8 *restrict src_ptr2,
+    UINT8 *restrict dest_ptr1,
+    UINT8 *restrict dest_ptr2,
+    UINT32 plane_height,
+    UINT32 plane_stride
+)
+{
+    UINT32 left, right, left2, left4, right2, right4;
+    double dl, dr;
+    UINT32 i;
+
+    // The caller passes the full plane height, so only the minimum (16) and
+    // the multiple (16) are promised; the max field is deliberately left
+    // blank. MUST_ITERATE(16,16,16) would promise exactly 16 iterations.
+#pragma MUST_ITERATE(16, , 16)
+
+    for (i = 0; i < plane_height; i++)
+    {
+        left  = src_ptr1[0];
+        right = src_ptr2[0];
+
+        // Broadcast the edge pixel into all 8 bytes of a double word.
+        left2 = _pack2(left, left);
+        left4 = _packl4(left2, left2);
+
+        right2 = _pack2(right, right);
+        right4 = _packl4(right2, right2);
+
+        dl = _itod(left4, left4);
+        dr = _itod(right4, right4);
+
+        // 6 x 8 = 48 border bytes on each side.
+        _amemd8(&dest_ptr1[ 0]) = dl;
+        _amemd8(&dest_ptr2[ 0]) = dr;
+
+        _amemd8(&dest_ptr1[ 8]) = dl;
+        _amemd8(&dest_ptr2[ 8]) = dr;
+
+        _amemd8(&dest_ptr1[16]) = dl;
+        _amemd8(&dest_ptr2[16]) = dr;
+
+        _amemd8(&dest_ptr1[24]) = dl;
+        _amemd8(&dest_ptr2[24]) = dr;
+
+        _amemd8(&dest_ptr1[32]) = dl;
+        _amemd8(&dest_ptr2[32]) = dr;
+
+        _amemd8(&dest_ptr1[40]) = dl;
+        _amemd8(&dest_ptr2[40]) = dr;
+
+        src_ptr1 += plane_stride;
+        src_ptr2 += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+/****************************************************************************
+ *
+ *  Replicates the first and last pixel of every row into the left and
+ *  right borders of a chroma plane.
+ *
+ *    src_ptr1     : first pixel of the first row (copied to the left).
+ *    src_ptr2     : last pixel of the first row (copied to the right).
+ *    dest_ptr1    : start of the left border of the first row.
+ *    dest_ptr2    : start of the right border of the first row.
+ *    plane_height : number of rows to process.
+ *    plane_stride : distance in bytes between consecutive rows.
+ *
+ *  Exactly 24 bytes are written on each side of every row with aligned
+ *  8-byte stores, so the border must be 24 bytes wide and both destination
+ *  pointers must be 8-byte aligned.
+ *
+ ****************************************************************************/
+static
+void copy_uvleft_right_border(
+    UINT8 *restrict src_ptr1,
+    UINT8 *restrict src_ptr2,
+    UINT8 *restrict dest_ptr1,
+    UINT8 *restrict dest_ptr2,
+    UINT32 plane_height,
+    UINT32 plane_stride
+)
+{
+    UINT32 left, right, left2, left4, right2, right4;
+    double dl, dr;
+    UINT32 i;
+
+    // The caller passes the full chroma plane height, so only the minimum
+    // (8) and the multiple (8) are promised; the max field is deliberately
+    // left blank. MUST_ITERATE(8,8,8) would promise exactly 8 iterations.
+#pragma MUST_ITERATE(8, , 8)
+
+    for (i = 0; i < plane_height; i++)
+    {
+        left  = src_ptr1[0];
+        right = src_ptr2[0];
+
+        // Broadcast the edge pixel into all 8 bytes of a double word.
+        left2 = _pack2(left, left);
+        left4 = _packl4(left2, left2);
+
+        right2 = _pack2(right, right);
+        right4 = _packl4(right2, right2);
+
+        dl = _itod(left4, left4);
+        dr = _itod(right4, right4);
+
+        // 3 x 8 = 24 border bytes on each side.
+        _amemd8(&dest_ptr1[ 0]) = dl;
+        _amemd8(&dest_ptr2[ 0]) = dr;
+
+        _amemd8(&dest_ptr1[ 8]) = dl;
+        _amemd8(&dest_ptr2[ 8]) = dr;
+
+        _amemd8(&dest_ptr1[16]) = dl;
+        _amemd8(&dest_ptr2[16]) = dr;
+
+        src_ptr1 += plane_stride;
+        src_ptr2 += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+/****************************************************************************
+ *
+ *  Copies the first and the last row of a plane (side borders included)
+ *  into every row of the top and bottom borders.
+ *
+ *    plane  : first pixel of the plane, border excluded.
+ *    border : border size in pixels (columns on each side, rows above and
+ *             below).
+ *    height : number of rows in the plane.
+ *    stride : bytes per row; a whole stride is copied for each border row.
+ *
+ ****************************************************************************/
+static void
+replicate_top_bottom_rows(unsigned char *plane, unsigned int border, int height, int stride)
+{
+    unsigned char *first_row = plane - border;
+    unsigned char *last_row = first_row + (height * stride) - stride;
+    unsigned char *above = first_row - (border * stride);
+    unsigned char *below = last_row + stride;
+    int row;
+
+    for (row = 0; row < (int)border; row++)
+    {
+        vpx_memcpy(above, first_row, stride);
+        vpx_memcpy(below, last_row, stride);
+        above += stride;
+        below += stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  Extends the Y, U and V planes of ybf into their borders: first the left
+ *  and right edge pixels of every row are replicated sideways, then the
+ *  top and bottom rows are replicated upwards and downwards.
+ *
+ *  The chroma geometry is derived by halving the luma stride, height,
+ *  width and border.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int border = ybf->border;
+    int stride = ybf->y_stride;
+    int height = ybf->y_height;
+    int width = ybf->y_width;
+    unsigned char *left_edge, *right_edge;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    left_edge = ybf->y_buffer;
+    right_edge = left_edge + width - 1;
+    copy_yleft_right_border(left_edge, right_edge,
+                            left_edge - border, right_edge + 1,
+                            height, stride);
+    replicate_top_bottom_rows(ybf->y_buffer, border, height, stride);
+
+    /* Chroma planes use half of every luma dimension. */
+    stride /= 2;
+    height /= 2;
+    width /= 2;
+    border /= 2;
+
+    /***********/
+    /* U Plane */
+    /***********/
+    left_edge = ybf->u_buffer;
+    right_edge = left_edge + width - 1;
+    copy_uvleft_right_border(left_edge, right_edge,
+                             left_edge - border, right_edge + 1,
+                             height, stride);
+    replicate_top_bottom_rows(ybf->u_buffer, border, height, stride);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    left_edge = ybf->v_buffer;
+    right_edge = left_edge + width - 1;
+    copy_uvleft_right_border(left_edge, right_edge,
+                             left_edge - border, right_edge + 1,
+                             height, stride);
+    replicate_top_bottom_rows(ybf->v_buffer, border, height, stride);
+}
+/****************************************************************************
+ *
+ *  Extends only the top and bottom borders of the Y, U and V planes of ybf
+ *  by copying the first and last row of each plane (side borders included)
+ *  into every border row with dat_copy transfers.
+ *
+ *  The chroma geometry is derived by halving the luma stride, height and
+ *  border. Before returning, the routine waits on the two most recently
+ *  issued transfers (one for the top, one for the bottom border); if no
+ *  transfer was issued at all it does not wait.
+ *
+ ****************************************************************************/
+void
+vpxyv12_extend_frame_tbborders(YV12_BUFFER_CONFIG *ybf)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+    int tid1 = 0, tid2 = 0;
+    int pending = 0;    // set once at least one pair of transfers was issued
+
+    unsigned int Border;
+    int plane_stride;
+    int plane_height;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    plane_height = ybf->y_height;
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->y_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        tid1 = dat_copy(src_ptr1, dest_ptr1, plane_stride);
+        tid2 = dat_copy(src_ptr2, dest_ptr2, plane_stride);
+        pending = 1;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    plane_stride /= 2;
+    plane_height /= 2;
+    Border /= 2;
+
+    /***********/
+    /* U Plane */
+    /***********/
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->u_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        tid1 = dat_copy(src_ptr1, dest_ptr1, plane_stride);
+        tid2 = dat_copy(src_ptr2, dest_ptr2, plane_stride);
+        pending = 1;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    /***********/
+    /* V Plane */
+    /***********/
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->v_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        tid1 = dat_copy(src_ptr1, dest_ptr1, plane_stride);
+        tid2 = dat_copy(src_ptr2, dest_ptr2, plane_stride);
+        pending = 1;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Never wait on an id that was not returned by dat_copy: with a border
+    // smaller than 2 the chroma loops do not run at all.
+    if (pending)
+    {
+        dat_wait(tid1);
+        dat_wait(tid2);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vpxyv12_copy_frame_dma
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+ *                  YV12_BUFFER_CONFIG *dst_ybc : Frame to copy to.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the source image into the destination image with
+ *                  three dat_copy2d transfers, one per plane. The borders
+ *                  are expected to have been updated beforehand, so every
+ *                  plane is copied whole, borders and all. This also avoids
+ *                  using the copy_left_right border functions when copying
+ *                  data between L2 and main memory; that would need a cache
+ *                  clean, which would require invalidating an entire frame.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size; all
+ *                  geometry is taken from src_ybc.
+ *                  The routine returns without waiting on the transfers.
+ *
+ ****************************************************************************/
+void
+vpxyv12_copy_frame_dma(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    const int border = src_ybc->border;
+    const int luma_stride = src_ybc->y_stride;
+    const int chroma_stride = src_ybc->uv_stride;
+
+    // Rows to move per plane: the image rows plus the border above and below
+    // (the chroma border is half the luma border on each side).
+    const int luma_rows = src_ybc->y_height + 2 * border;
+    const int chroma_rows = src_ybc->uv_height + border;
+
+    // Distance from the first image pixel back to the top-left border pixel.
+    const int luma_lead = border * (luma_stride + 1);
+    const int chroma_lead = border / 2 * (chroma_stride + 1);
+
+    dat_copy2d(DAT_2D2D,
+               src_ybc->y_buffer - luma_lead,
+               dst_ybc->y_buffer - luma_lead,
+               luma_stride,
+               luma_rows,
+               luma_stride);
+
+    dat_copy2d(DAT_2D2D,
+               src_ybc->u_buffer - chroma_lead,
+               dst_ybc->u_buffer - chroma_lead,
+               chroma_stride,
+               chroma_rows,
+               chroma_stride);
+
+    dat_copy2d(DAT_2D2D,
+               src_ybc->v_buffer - chroma_lead,
+               dst_ybc->v_buffer - chroma_lead,
+               chroma_stride,
+               chroma_rows,
+               chroma_stride);
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+ *                  YV12_BUFFER_CONFIG *dst_ybc : Frame to copy to.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the visible Y, U and V rows of the source image
+ *                  into the destination image, then rebuilds the
+ *                  destination's borders with
+ *                  vp8_yv12_extend_frame_borders().
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size: row
+ *                  counts and widths come from src_ybc, while each frame
+ *                  is stepped with its own stride.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned char *from, *to;
+    int rows_left;
+
+    /* Y plane */
+    from = src_ybc->y_buffer;
+    to = dst_ybc->y_buffer;
+
+    for (rows_left = src_ybc->y_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->y_width);
+        from += src_ybc->y_stride;
+        to += dst_ybc->y_stride;
+    }
+
+    /* U plane */
+    from = src_ybc->u_buffer;
+    to = dst_ybc->u_buffer;
+
+    for (rows_left = src_ybc->uv_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->uv_width);
+        from += src_ybc->uv_stride;
+        to += dst_ybc->uv_stride;
+    }
+
+    /* V plane */
+    from = src_ybc->v_buffer;
+    to = dst_ybc->v_buffer;
+
+    for (rows_left = src_ybc->uv_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->uv_width);
+        from += src_ybc->uv_stride;
+        to += dst_ybc->uv_stride;
+    }
+
+    vp8_yv12_extend_frame_borders(dst_ybc);
+}
diff --git a/vpx_scale/generic/bicubic_scaler.c b/vpx_scale/generic/bicubic_scaler.c
new file mode 100644
index 000000000..e3c2b4a80
--- /dev/null
+++ b/vpx_scale/generic/bicubic_scaler.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vpxscale_arbitrary.h"
+
+#define FIXED_POINT
+
+#define MAX_IN_WIDTH 800
+#define MAX_IN_HEIGHT 600
+#define MAX_OUT_WIDTH 800
+#define MAX_OUT_HEIGHT 600
+#define MAX_OUT_DIMENSION ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
+ MAX_OUT_WIDTH : MAX_OUT_HEIGHT)
+
+BICUBIC_SCALER_STRUCT g_b_scaler; // shared scaler state: cached dimensions, coefficient tables and line-index tables
+static int g_first_time = 1; // cleared by bicubic_coefficient_init() once g_b_scaler has been zeroed
+
+#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
+#pragma DATA_ALIGN (g_hbuf, 32);
+unsigned char g_hbuf[MAX_OUT_DIMENSION]; // scratch row installed as g_b_scaler.hbuf by bicubic_coefficient_setup()
+
+#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
+#pragma DATA_ALIGN (g_hbuf_uv, 32);
+unsigned char g_hbuf_uv[MAX_OUT_DIMENSION]; // scratch row installed as g_b_scaler.hbuf_uv by bicubic_coefficient_setup()
+
+
+#ifdef FIXED_POINT
+static int a_i = 0.6 * 65536; // 0.6 scaled by 2^16; the float build uses a = -0.6, so the sign is presumably folded into the c*_fixed formulas
+#else
+static float a = -0.6; // cubic parameter used by the floating-point C0..C3 helpers
+#endif
+
+#ifdef FIXED_POINT
+// Fixed-point cubic weight
+//   C0 = a*t^3 - a*t^2        (a is a_i / 65536)
+// t is the phase in Q16 notation; the result is returned in Q12.
+static INLINE short c0_fixed(unsigned int t)
+{
+    // a*t, a*t^2 and a*t^3, each kept in Q16 by dropping 16 bits per multiply
+    unsigned short a_t1 = (a_i * t) >> 16;
+    unsigned short a_t2 = (a_t1 * t) >> 16;
+    unsigned short a_t3 = (a_t2 * t) >> 16;
+    int difference = a_t2 - a_t3;
+
+    // Q16 -> Q12, then negate
+    return -(difference >> 4);
+}
+
+// Fixed-point cubic weight
+//   C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3        (a is a_i / 65536)
+// t is the phase in Q16 notation; the result is returned in Q12.
+static INLINE short c1_fixed(unsigned int t)
+{
+    unsigned short linear;      // a*t, Q16
+    unsigned short cubic;       // (2-a)*t^3, Q13
+    unsigned short quadratic;   // (3-2*a)*t^2, Q13
+    int pass;
+
+    linear = (a_i * t) >> 16;
+
+    // (2 - a) in Q13, multiplied by t three times
+    cubic = (2 << 13) - (a_i >> 3);
+
+    for (pass = 0; pass < 3; pass++)
+        cubic = (cubic * t) >> 16;
+
+    // (3 - 2*a) in Q13, multiplied by t twice
+    quadratic = (3 << 13) - (2 * (a_i >> 3));
+
+    for (pass = 0; pass < 2; pass++)
+        quadratic = (quadratic * t) >> 16;
+
+    // bring the linear term to Q13, combine, then Q13 -> Q12
+    return (((linear >> 3) - cubic + quadratic) >> 1);
+
+}
+
+// Fixed-point cubic weight
+//   C2 = 1 - (3-a)*t^2 + (2-a)*t^3        (a is a_i / 65536)
+// t is the phase in Q16 notation; the result is returned in Q12.
+static INLINE short c2_fixed(unsigned int t)
+{
+    unsigned short one = 1 << 13;   // 1.0 in Q13
+    unsigned short quadratic;       // (3-a)*t^2, Q13
+    unsigned short cubic;           // (2-a)*t^3, Q13
+    int pass;
+
+    // (3 - a) in Q13, multiplied by t twice
+    quadratic = (3 << 13) - (a_i >> 3);
+
+    for (pass = 0; pass < 2; pass++)
+        quadratic = (quadratic * t) >> 16;
+
+    // (2 - a) in Q13, multiplied by t three times
+    cubic = (2 << 13) - (a_i >> 3);
+
+    for (pass = 0; pass < 3; pass++)
+        cubic = (cubic * t) >> 16;
+
+    // Q13 -> Q12
+    return (one - quadratic + cubic) >> 1;
+}
+
+// Fixed-point cubic weight built from the three terms
+//   a*t, 2*a*t^2 and a*t^3        (a is a_i / 65536)
+// t is the phase in Q16 notation; the result is returned in Q12.
+static INLINE short c3_fixed(unsigned int t)
+{
+    int linear;       // a*t
+    int quadratic;    // 2*a*t^2
+    int cubic;        // a*t^3
+    int pass;
+
+    linear = (a_i * t) >> 16;
+
+    // 2*a, multiplied by t twice
+    quadratic = 2 * (a_i >> 1);
+
+    for (pass = 0; pass < 2; pass++)
+        quadratic = (quadratic * t) >> 16;
+
+    // a*t multiplied by t two more times
+    cubic = (a_i * t) >> 16;
+
+    for (pass = 0; pass < 2; pass++)
+        cubic = (cubic * t) >> 16;
+
+    // halve the odd-power terms to match the quadratic term, then scale to Q12
+    return ((quadratic - (linear >> 1) - (cubic >> 1)) >> 3);
+}
+#else
+// Floating-point cubic weight (a is the file-level cubic parameter):
+// C0 = -a*t^3 + a*t^2
+//
+float C0(float t)
+{
+ return -a * t * t * t + a * t * t;
+}
+
+// Floating-point cubic weight (a is the file-level cubic parameter):
+// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
+//
+float C1(float t)
+{
+ return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
+}
+
+// Floating-point cubic weight (a is the file-level cubic parameter):
+// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
+//
+float C2(float t)
+{
+ return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
+}
+
+// Floating-point cubic weight (a is the file-level cubic parameter):
+// C3 = a*t - 2*a*t^2 + a*t^3
+//
+float C3(float t)
+{
+ return a * t * t * t - 2.0f * a * t * t + a * t;
+}
+#endif
+
+#if 0
+int compare_real_fixed() // debug check: counts mismatches between the fixed-point and floating-point weights over 10000 phases
+{
+ int i, errors = 0;
+ float mult = 1.0 / 10000.0;
+ unsigned int fixed_mult = mult * 4294967296;//65536;
+ unsigned int phase_offset_int;
+ float phase_offset_real;
+
+ for (i = 0; i < 10000; i++)
+ {
+ int fixed0, fixed1, fixed2, fixed3, fixed_total;
+ int real0, real1, real2, real3, real_total;
+
+ phase_offset_real = (float)i * mult;
+ phase_offset_int = (fixed_mult * i) >> 16; // same phase as phase_offset_real, scaled for the c*_fixed helpers
+// phase_offset_int = phase_offset_real * 65536;
+
+ fixed0 = c0_fixed(phase_offset_int);
+ real0 = C0(phase_offset_real) * 4096.0; // float weight scaled by 4096 to compare with the fixed result
+
+ if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1))) // magnitudes may differ by at most 1
+ errors++;
+
+ fixed1 = c1_fixed(phase_offset_int);
+ real1 = C1(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
+ errors++;
+
+ fixed2 = c2_fixed(phase_offset_int);
+ real2 = C2(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
+ errors++;
+
+ fixed3 = c3_fixed(phase_offset_int);
+ real3 = C3(phase_offset_real) * 4096.0;
+
+ if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
+ errors++;
+
+ fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
+ real_total = real0 + real1 + real2 + real3;
+
+ if ((fixed_total > 4097) || (fixed_total < 4094)) // the four weights should sum to roughly 4096
+ errors ++;
+
+ if ((real_total > 4097) || (real_total < 4095))
+ errors ++;
+ }
+
+ return errors; // number of failed checks, 0 when every phase agrees
+}
+#endif
+
+// Greatest common divisor of two integers, computed by repeated
+// subtraction. This is slow compared to Euclid's remainder-based
+// algorithm, but it does not require any division.
+int gcd(int a, int b)
+{
+    // Repeated subtraction never terminates when either value is 0 (and
+    // is meaningless for negative values). Returning 0 would make any
+    // caller that divides by the result fail, so report 1 instead.
+    if (a < 1 || b < 1)
+        return 1;
+
+    // Keep taking the smaller value away from the larger one until both
+    // are equal; that common value is the result.
+    for (;;)
+    {
+        if (a == b)
+            break;
+
+        if (a > b)
+            a -= b;
+        else
+            b -= a;
+    }
+
+    return a;
+}
+
+// Zeroes the shared scaler state and records that this has been done.
+void bicubic_coefficient_init()
+{
+    g_first_time = 0;
+    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
+}
+
+// Releases every table owned by the shared scaler state and zeroes the
+// state. Does nothing if the state has never been initialized.
+void bicubic_coefficient_destroy()
+{
+    if (g_first_time)
+        return;
+
+    // line-index tables
+    if (g_b_scaler.l_w)
+        vpx_free(g_b_scaler.l_w);
+
+    if (g_b_scaler.l_h)
+        vpx_free(g_b_scaler.l_h);
+
+    if (g_b_scaler.l_h_uv)
+        vpx_free(g_b_scaler.l_h_uv);
+
+    // coefficient tables
+    if (g_b_scaler.c_w)
+        vpx_free(g_b_scaler.c_w);
+
+    if (g_b_scaler.c_h)
+        vpx_free(g_b_scaler.c_h);
+
+    if (g_b_scaler.c_h_uv)
+        vpx_free(g_b_scaler.c_h_uv);
+
+    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
+}
+
+// Create the coefficients that will be used for the cubic interpolation.
+// Because scaling does not have to be equal in the vertical and horizontal
+// regimes the phase offsets will be different. There are 4 coefficients
+// for each point, two on each side. The layout is that there are the
+// 4 coefficients for each phase in the array and then the next phase.
+//
+// The tables are cached in g_b_scaler: when the four dimensions match the
+// previous call nothing is recomputed.
+//
+// Returns 0 on success (or when the cached tables are still valid) and -1
+// when an output dimension is not positive or a table cannot be allocated.
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
+{
+    int i;
+#ifdef FIXED_POINT
+    int phase_offset_int;
+    unsigned int fixed_mult;
+    int product_val = 0;
+#else
+    float phase_offset;
+#endif
+    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;
+
+    if (g_first_time)
+        bicubic_coefficient_init();
+
+
+    // check to see if the coefficients have already been set up correctly
+    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
+        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
+        return 0;
+
+    g_b_scaler.in_width = in_width;
+    g_b_scaler.in_height = in_height;
+    g_b_scaler.out_width = out_width;
+    g_b_scaler.out_height = out_height;
+
+    // Don't want to allow crazy scaling, just try and prevent a catastrophic
+    // failure here. Want to fail after setting the member functions so if
+    // if the scaler is called the member functions will not scale.
+    if (out_width <= 0 || out_height <= 0)
+        return -1;
+
+    // reduce in/out width and height ratios using the gcd
+    gcd_w = gcd(out_width, in_width);
+    gcd_h = gcd(out_height, in_height);
+    gcd_h_uv = gcd(out_height, in_height / 2);
+
+    // the numerator width and height are to be saved in
+    // globals so they can be used during the scaling process
+    // without having to be recalculated.
+    g_b_scaler.nw = out_width / gcd_w;
+    d_w = in_width / gcd_w;
+
+    g_b_scaler.nh = out_height / gcd_h;
+    d_h = in_height / gcd_h;
+
+    g_b_scaler.nh_uv = out_height / gcd_h_uv;
+    d_h_uv = (in_height / 2) / gcd_h_uv;
+
+    // allocate memory for the line-index tables
+    if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);
+
+    if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);
+
+    if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);
+
+    // The height tables are filled for i = 0 .. out_height inclusive, so
+    // they need out_height + 1 entries.
+    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * sizeof(short));
+    g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
+    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
+
+    // allocate memory for the coefficients (4 per phase)
+    if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);
+
+    if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);
+
+    if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);
+
+    g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * sizeof(short));
+    g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * sizeof(short));
+    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * sizeof(short));
+
+    // Never fill a table that could not be allocated. Forget the cached
+    // output size so that the next call tries again instead of reporting
+    // the tables as ready; whatever was allocated stays in g_b_scaler and
+    // is released by the next setup or by bicubic_coefficient_destroy().
+    if (!g_b_scaler.l_w || !g_b_scaler.l_h || !g_b_scaler.l_h_uv
+        || !g_b_scaler.c_w || !g_b_scaler.c_h || !g_b_scaler.c_h_uv)
+    {
+        g_b_scaler.out_width = 0;
+        g_b_scaler.out_height = 0;
+        return -1;
+    }
+
+    g_b_scaler.hbuf = g_hbuf;
+    g_b_scaler.hbuf_uv = g_hbuf_uv;
+
+    // Set up polyphase filter taps. This needs to be done before
+    // the scaling because of the floating point math required. The
+    // coefficients are multiplied by 2^12 so that fixed point math
+    // can be used in the main scaling loop.
+#ifdef FIXED_POINT
+    fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;
+
+    product_val = 0;
+
+    // Note: the horizontal taps are stored in the opposite order (c3..c0)
+    // to the vertical taps (c0..c3).
+    for (i = 0; i < g_b_scaler.nw; i++)
+    {
+        if (product_val > g_b_scaler.nw)
+            product_val -= g_b_scaler.nw;
+
+        phase_offset_int = (fixed_mult * product_val) >> 16;
+
+        g_b_scaler.c_w[i*4]   = c3_fixed(phase_offset_int);
+        g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
+        g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
+        g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);
+
+        product_val += d_w;
+    }
+
+
+    fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;
+
+    product_val = 0;
+
+    for (i = 0; i < g_b_scaler.nh; i++)
+    {
+        if (product_val > g_b_scaler.nh)
+            product_val -= g_b_scaler.nh;
+
+        phase_offset_int = (fixed_mult * product_val) >> 16;
+
+        g_b_scaler.c_h[i*4]   = c0_fixed(phase_offset_int);
+        g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
+        g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
+        g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);
+
+        product_val += d_h;
+    }
+
+    fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;
+
+    product_val = 0;
+
+    for (i = 0; i < g_b_scaler.nh_uv; i++)
+    {
+        if (product_val > g_b_scaler.nh_uv)
+            product_val -= g_b_scaler.nh_uv;
+
+        phase_offset_int = (fixed_mult * product_val) >> 16;
+
+        g_b_scaler.c_h_uv[i*4]   = c0_fixed(phase_offset_int);
+        g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
+        g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
+        g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);
+
+        product_val += d_h_uv;
+    }
+
+#else
+
+    for (i = 0; i < g_nw; i++)
+    {
+        phase_offset = (float)((i * d_w) % g_nw) / (float)g_nw;
+        g_c_w[i*4]   = (C3(phase_offset) * 4096.0);
+        g_c_w[i*4+1] = (C2(phase_offset) * 4096.0);
+        g_c_w[i*4+2] = (C1(phase_offset) * 4096.0);
+        g_c_w[i*4+3] = (C0(phase_offset) * 4096.0);
+    }
+
+    for (i = 0; i < g_nh; i++)
+    {
+        phase_offset = (float)((i * d_h) % g_nh) / (float)g_nh;
+        g_c_h[i*4]   = (C0(phase_offset) * 4096.0);
+        g_c_h[i*4+1] = (C1(phase_offset) * 4096.0);
+        g_c_h[i*4+2] = (C2(phase_offset) * 4096.0);
+        g_c_h[i*4+3] = (C3(phase_offset) * 4096.0);
+    }
+
+    for (i = 0; i < g_nh_uv; i++)
+    {
+        phase_offset = (float)((i * d_h_uv) % g_nh_uv) / (float)g_nh_uv;
+        g_c_h_uv[i*4]   = (C0(phase_offset) * 4096.0);
+        g_c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0);
+        g_c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0);
+        g_c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0);
+    }
+
+#endif
+
+    // Create an array that corresponds input lines to output lines.
+    // This doesn't require floating point math, but it does require
+    // a division and because hardware division is not present that
+    // is a call.
+    for (i = 0; i < out_width; i++)
+    {
+        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;
+
+        // remember the last output column whose source column still has
+        // two more columns to its right
+        if ((g_b_scaler.l_w[i] + 2) <= in_width)
+            g_b_scaler.max_usable_out_width = i;
+
+    }
+
+    // one entry more than there are output rows (see the allocation above)
+    for (i = 0; i < out_height + 1; i++)
+    {
+        g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
+        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
+    }
+
+    return 0;
+}
+
+// Scales one plane with the coefficient and line-index tables held in
+// g_b_scaler (built by bicubic_coefficient_setup()).
+//
+//   in_width, in_height, in_stride    : geometry of input_image.
+//   out_width, out_height, out_stride : geometry of output_image.
+//   input_image, output_image         : first pixel of each plane.
+//
+// Every output row is produced in two steps: the source rows are filtered
+// vertically into a temporary row, which is then filtered and expanded
+// horizontally into the output row. Always returns 0.
+//
+// NOTE(review): filtered sums are stored into unsigned char without
+// clamping, so results outside 0..255 wrap - confirm this is acceptable.
+// NOTE(review): the vertical filter reads the row above the selected one,
+// including for row 0 when the phase is not 0 - presumably the caller
+// provides a border; confirm.
+int bicubic_scale(int in_width, int in_height, int in_stride,
+                  int out_width, int out_height, int out_stride,
+                  unsigned char *input_image, unsigned char *output_image)
+{
+    short *RESTRICT l_w, * RESTRICT l_h;
+    short *RESTRICT c_w, * RESTRICT c_h;
+    unsigned char *RESTRICT ip, * RESTRICT op;
+    unsigned char *RESTRICT hbuf;
+    int h, w, lw, lh;
+    int temp_sum;
+    int phase_offset_w, phase_offset_h;
+
+    c_w = g_b_scaler.c_w;
+    c_h = g_b_scaler.c_h;
+
+    op = output_image;
+
+    l_w = g_b_scaler.l_w;
+    l_h = g_b_scaler.l_h;
+
+    phase_offset_h = 0;
+
+    for (h = 0; h < out_height; h++)
+    {
+        // select the row to work on
+        lh = l_h[h];
+        ip = input_image + (in_stride * lh);
+
+        // filter the row vertically into a temporary buffer.
+        // If the phase offset == 0 then all the multiplication
+        // is going to result in the output equalling the input.
+        // So instead point the temporary buffer to the input.
+        // Also handle the boundary condition of not being able to
+        // filter the last lines.
+        if (phase_offset_h && (lh < in_height - 2))
+        {
+            hbuf = g_b_scaler.hbuf;
+
+            for (w = 0; w < in_width; w++)
+            {
+                temp_sum =  c_h[phase_offset_h*4+3] * ip[w - in_stride];
+                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
+                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
+                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];
+
+                hbuf[w] = temp_sum >> 12;
+            }
+        }
+        else
+            hbuf = ip;
+
+        // increase the phase offset for the next time around.
+        if (++phase_offset_h >= g_b_scaler.nh)
+            phase_offset_h = 0;
+
+        // now filter and expand it horizontally into the final
+        // output buffer
+        phase_offset_w = 0;
+
+        for (w = 0; w < out_width; w++)
+        {
+            // get the index to use to expand the image
+            lw = l_w[w];
+
+            // boundary conditions: at either end of the row the source
+            // pixel is copied unfiltered. This is tested before the taps
+            // are read so that hbuf[lw - 1] and hbuf[lw + 2] are never
+            // fetched from outside the row.
+            if ((lw == 0) || ((lw + 2) >= in_width))
+            {
+                temp_sum = hbuf[lw];
+            }
+            else
+            {
+                temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
+                temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
+                temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
+                temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
+                temp_sum = temp_sum >> 12;
+            }
+
+            // the phase advances for every output pixel, filtered or not
+            if (++phase_offset_w >= g_b_scaler.nw)
+                phase_offset_w = 0;
+
+            op[w] = temp_sum;
+        }
+
+        op += out_stride;
+    }
+
+    return 0;
+}
+
+// Clears the cached output size so that the next call to
+// bicubic_coefficient_setup() cannot match the cache and rebuilds its tables.
+void bicubic_scale_frame_reset()
+{
+    g_b_scaler.out_height = 0;
+    g_b_scaler.out_width = 0;
+}
+
+// Scales the three planes of src into dst at new_width x new_height.
+// dst's dimensions are overwritten first: the chroma planes get half the
+// new size and each stride is set equal to the plane width. The
+// coefficient tables in g_b_scaler are used as they are; this routine
+// does not set them up.
+void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                         int new_width, int new_height)
+{
+    /* describe the destination frame */
+    dst->y_width = new_width;
+    dst->y_height = new_height;
+    dst->uv_width = new_width / 2;
+    dst->uv_height = new_height / 2;
+
+    dst->y_stride = dst->y_width;
+    dst->uv_stride = dst->uv_width;
+
+    /* Y plane */
+    bicubic_scale(src->y_width, src->y_height, src->y_stride,
+                  new_width, new_height, dst->y_stride,
+                  src->y_buffer, dst->y_buffer);
+
+    /* U plane */
+    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
+                  new_width / 2, new_height / 2, dst->uv_stride,
+                  src->u_buffer, dst->u_buffer);
+
+    /* V plane */
+    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
+                  new_width / 2, new_height / 2, dst->uv_stride,
+                  src->v_buffer, dst->v_buffer);
+}
diff --git a/vpx_scale/generic/gen_scalers.c b/vpx_scale/generic/gen_scalers.c
new file mode 100644
index 000000000..a5e545f70
--- /dev/null
+++ b/vpx_scale/generic/gen_scalers.c
@@ -0,0 +1,954 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+/****************************************************************************
+* Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_4_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5 (5 output pixels for
+ *                  every 4 input pixels).
+ *
+ *  SPECIAL NOTES : Pixels are consumed in groups of 4. Nothing is written
+ *                  when source_width is below 4. Widths that are not a
+ *                  multiple of 4 are not handled specially.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_4_5_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // source_width is unsigned: without this guard "source_width - 4" wraps
+    // for widths below 4 and the loop would run far past both buffers.
+    if (source_width < 4)
+        return;
+
+    // Every group except the last: the 5th output blends src[3] with the
+    // first pixel of the following group (src[4]).
+    for (i = 0; i < source_width - 4; i += 4)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char) a;
+        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+        c = src[2] * 154;
+        a = src[3];
+        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+        b = src[4];
+        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);
+
+        src += 4;
+        des += 5;
+    }
+
+    // Last group: there is no pixel to the right, so the 5th output simply
+    // repeats src[3].
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+    c = src[2] * 154;
+    a = src[3];
+    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+    des [4] = (unsigned char)(a);
+
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_vertical_band_4_5_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+ * height of the band scaled is 4-pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *col = dest;
+    unsigned int remaining;
+
+    // Walk the band one column at a time; rows 1-4 are rewritten in place.
+    for (remaining = dest_width; remaining != 0; --remaining, ++col)
+    {
+        unsigned int r0, r1, r2_x154, r3, r5;
+
+        r0 = col[0];
+        r1 = col[dest_pitch];
+        col[dest_pitch] = (unsigned char)((51 * r0 + 205 * r1 + 128) >> 8);
+
+        // Row 2 enters two outputs with weight 154, so scale it once.
+        r2_x154 = 154 * col[dest_pitch * 2];
+        r3 = col[dest_pitch * 3];
+        col[dest_pitch * 2] = (unsigned char)((102 * r1 + r2_x154 + 128) >> 8);
+        col[dest_pitch * 3] = (unsigned char)((r2_x154 + 102 * r3 + 128) >> 8);
+
+        // Row 4 blends row 3 with row 5, the first line of the next band.
+        r5 = col[dest_pitch * 5];
+        col[dest_pitch * 4] = (unsigned char)((205 * r3 + 51 * r5 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_last_vertical_band_4_5_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 4 to 5. The
+ * height of the band scaled is 4-pixels.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int x;
+
+    for (x = 0; x < dest_width; x++)
+    {
+        unsigned char *const px = dest + x;
+        const unsigned int r0 = px[0];
+        const unsigned int r1 = px[dest_pitch];
+        unsigned int r2_x154, r3;
+
+        px[dest_pitch] = (unsigned char)((r0 * 51 + r1 * 205 + 128) >> 8);
+
+        r2_x154 = px[dest_pitch * 2] * 154;
+        r3 = px[dest_pitch * 3];
+
+        px[dest_pitch * 2] = (unsigned char)((r1 * 102 + r2_x154 + 128) >> 8);
+        px[dest_pitch * 3] = (unsigned char)((r2_x154 + r3 * 102 + 128) >> 8);
+
+        // Last band: no following line to interpolate with, so row 4 is a
+        // copy of the original row 3.
+        px[dest_pitch * 4] = (unsigned char) r3;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_2_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 2 to 3 (3 output pixels for
+ *                  every 2 input pixels).
+ *
+ *  SPECIAL NOTES : Pixels are consumed in pairs. Nothing is written when
+ *                  source_width is below 2. Odd widths are not handled
+ *                  specially.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_2_3_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // source_width is unsigned: without this guard "source_width - 2" wraps
+    // for widths below 2 and the loop would run far past both buffers.
+    if (source_width < 2)
+        return;
+
+    // Every pair except the last: the 3rd output blends src[1] with the
+    // first pixel of the following pair (src[2]).
+    for (i = 0; i < source_width - 2; i += 2)
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];
+
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 85 + 171 * b + 128) >> 8);
+        des [2] = (unsigned char)((b * 171 + 85 * c + 128) >> 8);
+
+        src += 2;
+        des += 3;
+    }
+
+    // Last pair: no pixel to the right, so the 3rd output repeats src[1].
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 85 + 171 * b + 128) >> 8);
+    des [2] = (unsigned char)(b);
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_vertical_band_2_3_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+ * height of the band scaled is 2-pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *col = dest;
+    unsigned int remaining;
+
+    for (remaining = dest_width; remaining != 0; --remaining, ++col)
+    {
+        // Rows 0 and 1 belong to this band; row 3 is the first line of the
+        // next band. All three are read before anything is written.
+        const unsigned int r0 = col[0];
+        const unsigned int r1 = col[dest_pitch];
+        const unsigned int r3 = col[dest_pitch * 3];
+
+        col[dest_pitch]     = (unsigned char)((85 * r0 + 171 * r1 + 128) >> 8);
+        col[dest_pitch * 2] = (unsigned char)((171 * r1 + 85 * r3 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_last_vertical_band_2_3_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 2 to 3. The
+ * height of the band scaled is 2-pixels.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int x;
+
+    for (x = 0; x < dest_width; x++)
+    {
+        unsigned char *const px = dest + x;
+        const unsigned int r0 = px[0];
+        const unsigned int r1 = px[dest_pitch];
+
+        px[dest_pitch] = (unsigned char)((r0 * 85 + r1 * 171 + 128) >> 8);
+
+        // Last band: nothing below to blend with, so row 2 copies the
+        // original row 1.
+        px[dest_pitch * 2] = (unsigned char) r1;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 5 (5 output pixels for
+ *                  every 3 input pixels).
+ *
+ *  SPECIAL NOTES : Pixels are consumed in groups of 3. Nothing is written
+ *                  when source_width is below 3. Widths that are not a
+ *                  multiple of 3 are not handled specially.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_3_5_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // source_width is unsigned: without this guard "source_width - 3" wraps
+    // for widths below 3 and the loop would run far past both buffers.
+    if (source_width < 3)
+        return;
+
+    // Every group except the last: the 5th output blends src[2] with the
+    // first pixel of the following group (src[3]).
+    for (i = 0; i < source_width - 3; i += 3)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+
+        c = src[2] ;
+        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+        a = src[3];
+        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
+
+        src += 3;
+        des += 5;
+    }
+
+    // Last group: no pixel to the right, so the 5th output repeats src[2].
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+
+    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+    c = src[2] ;
+    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+    des [4] = (unsigned char)(c);
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_vertical_band_3_5_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+ * height of the band scaled is 3-pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *col = dest;
+    unsigned int remaining;
+
+    // One column per iteration; rows 1-4 of the band are rewritten in place.
+    for (remaining = dest_width; remaining != 0; --remaining, ++col)
+    {
+        unsigned int r0, r1, r2, r5;
+
+        r0 = col[0];
+        r1 = col[dest_pitch];
+        col[dest_pitch] = (unsigned char)((102 * r0 + 154 * r1 + 128) >> 8);
+
+        // r2 keeps the original row 2 value; the row itself is overwritten
+        // on the next line.
+        r2 = col[dest_pitch * 2];
+        col[dest_pitch * 2] = (unsigned char)((205 * r1 + 51 * r2 + 128) >> 8);
+        col[dest_pitch * 3] = (unsigned char)((51 * r1 + 205 * r2 + 128) >> 8);
+
+        // Row 4 blends the original row 2 with row 5, the first line of
+        // the next band.
+        r5 = col[dest_pitch * 5];
+        col[dest_pitch * 4] = (unsigned char)((154 * r2 + 102 * r5 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_last_vertical_band_3_5_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 3 to 5. The
+ * height of the band scaled is 3-pixels.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int x;
+
+    for (x = 0; x < dest_width; x++)
+    {
+        unsigned char *const px = dest + x;
+        const unsigned int r0 = px[0];
+        const unsigned int r1 = px[dest_pitch];
+        unsigned int r2;
+
+        px[dest_pitch] = (unsigned char)((r0 * 102 + r1 * 154 + 128) >> 8);
+
+        r2 = px[dest_pitch * 2];
+        px[dest_pitch * 2] = (unsigned char)((r1 * 205 + r2 * 51 + 128) >> 8);
+        px[dest_pitch * 3] = (unsigned char)((r1 * 51 + r2 * 205 + 128) >> 8);
+
+        // Last band: no following line to interpolate with, so row 4 is a
+        // copy of the original row 2.
+        px[dest_pitch * 4] = (unsigned char) r2;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_3_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 4 (4 output pixels for
+ *                  every 3 input pixels).
+ *
+ *  SPECIAL NOTES : Pixels are consumed in groups of 3. Nothing is written
+ *                  when source_width is below 3. Widths that are not a
+ *                  multiple of 3 are not handled specially.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_3_4_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // source_width is unsigned: without this guard "source_width - 3" wraps
+    // for widths below 3 and the loop would run far past both buffers.
+    if (source_width < 3)
+        return;
+
+    // Every group except the last: the 4th output blends src[2] with the
+    // first pixel of the following group (src[3]).
+    for (i = 0; i < source_width - 3; i += 3)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);
+
+        c = src[2];
+        des [2] = (unsigned char)((b + c + 1) >> 1);
+
+        a = src[3];
+        des [3] = (unsigned char)((c * 192 + a * 64 + 128) >> 8);
+
+        src += 3;
+        des += 4;
+    }
+
+    // Last group: no pixel to the right, so the 4th output repeats src[2].
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);
+
+    c = src[2] ;
+    des [2] = (unsigned char)((b + c + 1) >> 1);
+    des [3] = (unsigned char)(c);
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_vertical_band_3_4_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+ * height of the band scaled is 3-pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *col = dest;
+    unsigned int remaining;
+
+    // One column per iteration; rows 1-3 of the band are rewritten in place.
+    for (remaining = dest_width; remaining != 0; --remaining, ++col)
+    {
+        unsigned int r0, r1, r2, r4;
+
+        r0 = col[0];
+        r1 = col[dest_pitch];
+        col[dest_pitch] = (unsigned char)((64 * r0 + 192 * r1 + 128) >> 8);
+
+        // Row 2 becomes the rounded average of the original rows 1 and 2.
+        r2 = col[dest_pitch * 2];
+        col[dest_pitch * 2] = (unsigned char)((r1 + r2 + 1) >> 1);
+
+        // Row 3 blends the original row 2 with row 4, the first line of
+        // the next band.
+        r4 = col[dest_pitch * 4];
+        col[dest_pitch * 3] = (unsigned char)((192 * r2 + 64 * r4 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_last_vertical_band_3_4_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 3 to 4. The
+ * height of the band scaled is 3-pixels.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int x;
+
+    for (x = 0; x < dest_width; x++)
+    {
+        unsigned char *const px = dest + x;
+        const unsigned int r0 = px[0];
+        const unsigned int r1 = px[dest_pitch];
+        unsigned int r2;
+
+        px[dest_pitch] = (unsigned char)((r0 * 64 + r1 * 192 + 128) >> 8);
+
+        r2 = px[dest_pitch * 2];
+        px[dest_pitch * 2] = (unsigned char)((r1 + r2 + 1) >> 1);
+
+        // Last band: no following line to interpolate with, so row 3 is a
+        // copy of the original row 2.
+        px[dest_pitch * 3] = (unsigned char) r2;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 1 to 2 (2 output pixels for
+ *                  every input pixel).
+ *
+ *  SPECIAL NOTES : Nothing is written when source_width is 0.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_1_2_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // source_width is unsigned: without this guard "source_width - 1" wraps
+    // for an empty line and the loop would run far past both buffers.
+    if (source_width == 0)
+        return;
+
+    // Every pixel except the last: emit the pixel, then the rounded average
+    // of it and its right-hand neighbour.
+    for (i = 0; i < source_width - 1; i += 1)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a + b + 1) >> 1);
+        src += 1;
+        des += 2;
+    }
+
+    // Last pixel: no right-hand neighbour, so it is simply doubled.
+    a = src[0];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)(a);
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_vertical_band_1_2_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+ * height of the band scaled is 1-pixel.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int x;
+
+    for (x = 0; x < dest_width; x++)
+    {
+        unsigned char *const px = dest + x;
+
+        // Row 1 becomes the rounded average of row 0 and row 2, the first
+        // line of the next band.
+        const unsigned int above = px[0];
+        const unsigned int below = px[dest_pitch * 2];
+
+        px[dest_pitch] = (unsigned char)((above + below + 1) >> 1);
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8cx_last_vertical_band_1_2_scale_c
+ *
+ * INPUTS : unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_pitch : Stride of destination data.
+ * unsigned int dest_width : Width of destination data.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales last vertical band of pixels by scale 1 to 2. The
+ * height of the band scaled is 1-pixel.
+ *
+ * SPECIAL NOTES : The routine does not have available the first line of
+ * the band below the current band, since this is the
+ * last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *col = dest;
+    unsigned int remaining;
+
+    // Last band: nothing below to average with, so row 1 duplicates row 0.
+    for (remaining = dest_width; remaining != 0; --remaining, ++col)
+        col[dest_pitch] = col[0];
+}
+
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_5_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 4 (4 output pixels for
+ *                  every 5 input pixels).
+ *
+ *  SPECIAL NOTES : Each iteration reads a full group of 5 source pixels,
+ *                  including when source_width is not a multiple of 5.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_5_4_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int consumed;
+    const unsigned char *in = source;
+    unsigned char *out = dest;
+
+    (void) dest_width;
+
+    for (consumed = 0; consumed < source_width; consumed += 5, in += 5, out += 4)
+    {
+        // Load the whole group before writing any output.
+        const unsigned int p0 = in[0];
+        const unsigned int p1 = in[1];
+        const unsigned int p2 = in[2];
+        const unsigned int p3 = in[3];
+        const unsigned int p4 = in[4];
+
+        out[0] = (unsigned char) p0;
+        out[1] = (unsigned char)((192 * p1 + 64 * p2 + 128) >> 8);
+        out[2] = (unsigned char)((128 * p2 + 128 * p3 + 128) >> 8);
+        out[3] = (unsigned char)((64 * p3 + 192 * p4 + 128) >> 8);
+    }
+}
+
+
+
+
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *in = source;
+    unsigned char *out = dest;
+    unsigned int remaining;
+
+    // For each column, 5 source rows are reduced to 4 destination rows.
+    for (remaining = dest_width; remaining != 0; --remaining, ++in, ++out)
+    {
+        // Load the whole column before writing any output.
+        const unsigned int r0 = in[0 * src_pitch];
+        const unsigned int r1 = in[1 * src_pitch];
+        const unsigned int r2 = in[2 * src_pitch];
+        const unsigned int r3 = in[3 * src_pitch];
+        const unsigned int r4 = in[4 * src_pitch];
+
+        out[0 * dest_pitch] = (unsigned char) r0;
+        out[1 * dest_pitch] = (unsigned char)((192 * r1 + 64 * r2 + 128) >> 8);
+        out[2 * dest_pitch] = (unsigned char)((128 * r2 + 128 * r3 + 128) >> 8);
+        out[3 * dest_pitch] = (unsigned char)((64 * r3 + 192 * r4 + 128) >> 8);
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_5_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 3 (3 output pixels for
+ *                  every 5 input pixels).
+ *
+ *  SPECIAL NOTES : Each iteration reads a full group of 5 source pixels,
+ *                  including when source_width is not a multiple of 5.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_5_3_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int consumed;
+    const unsigned char *in = source;
+    unsigned char *out = dest;
+
+    (void) dest_width;
+
+    for (consumed = 0; consumed < source_width; consumed += 5, in += 5, out += 3)
+    {
+        // Load the whole group before writing any output.
+        const unsigned int p0 = in[0];
+        const unsigned int p1 = in[1];
+        const unsigned int p2 = in[2];
+        const unsigned int p3 = in[3];
+        const unsigned int p4 = in[4];
+
+        out[0] = (unsigned char) p0;
+        out[1] = (unsigned char)((85 * p1 + 171 * p2 + 128) >> 8);
+        out[2] = (unsigned char)((171 * p3 + 85 * p4 + 128) >> 8);
+    }
+
+}
+
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *in = source;
+    unsigned char *out = dest;
+    unsigned int remaining;
+
+    // For each column, 5 source rows are reduced to 3 destination rows.
+    for (remaining = dest_width; remaining != 0; --remaining, ++in, ++out)
+    {
+        // Load the whole column before writing any output.
+        const unsigned int r0 = in[0 * src_pitch];
+        const unsigned int r1 = in[1 * src_pitch];
+        const unsigned int r2 = in[2 * src_pitch];
+        const unsigned int r3 = in[3 * src_pitch];
+        const unsigned int r4 = in[4 * src_pitch];
+
+        out[0 * dest_pitch] = (unsigned char) r0;
+        out[1 * dest_pitch] = (unsigned char)((85 * r1 + 171 * r2 + 128) >> 8);
+        out[2 * dest_pitch] = (unsigned char)((171 * r3 + 85 * r4 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_2_1_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 2 to 1: the first pixel of
+ *                  each pair is kept and the second is dropped (no
+ *                  filtering).
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8cx_horizontal_line_2_1_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int consumed;
+    const unsigned char *in = source;
+    unsigned char *out = dest;
+
+    (void) dest_width;
+
+    // Plain decimation: copy every other source pixel.
+    for (consumed = 0; consumed < source_width; consumed += 2)
+    {
+        *out++ = *in;
+        in += 2;
+    }
+}
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Neither pitch is needed: a single source line is copied unchanged.
+    (void) src_pitch;
+    (void) dest_pitch;
+
+    // Unfiltered 2:1 vertical step: dest_width bytes go straight from
+    // source to dest.
+    vpx_memcpy(dest, source, dest_width);
+}
+
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Filtered 2:1 vertical step: each output pixel is a 3-tap blend of the
+    // line above, the current line and the line below, weighted 3/10/3 out
+    // of 16 with rounding. The caller must therefore make one line on either
+    // side of source readable.
+    //
+    // The neighbouring lines are addressed through their own row pointers.
+    // Indexing source[i - (int)src_pitch] with an unsigned i performs the
+    // subtraction in unsigned arithmetic, which wraps to a huge positive
+    // offset and reads far outside the buffer when pointers are wider than
+    // unsigned int.
+    const unsigned char *above = source - src_pitch;
+    const unsigned char *below = source + src_pitch;
+    unsigned int i;
+    int temp;
+
+    (void) dest_pitch;
+
+    for (i = 0; i < dest_width; i++)
+    {
+        temp = 8;                   // rounding term for the >> 4 below
+        temp += above[i] * 3;
+        temp += source[i] * 10;
+        temp += below[i] * 3;
+        temp >>= 4 ;
+        dest[i] = (unsigned char)(temp);
+    }
+
+}
diff --git a/vpx_scale/generic/scalesystemdependant.c b/vpx_scale/generic/scalesystemdependant.c
new file mode 100644
index 000000000..28f5c7252
--- /dev/null
+++ b/vpx_scale/generic/scalesystemdependant.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+#ifdef HAVE_CONFIG_H
+#include "vpx_config.h"
+#endif
+
+// Dispatch pointer for frame-border extension, followed by the prototype of
+// the implementation that vp8_scale_machine_specific_config() stores in it.
+void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+
+// Dispatch pointer for the "yonly" frame copy and its implementation.
+// NOTE(review): presumably copies only the Y plane -- confirm in yv12extend.c.
+void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+// Dispatch pointer for the frame copy and its implementation.
+void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Points the scaler and YV12 copy/extend function
+ *                  pointers at the generic C implementations.
+ *
+ *  SPECIAL NOTES : The scaler pointers are only set when
+ *                  CONFIG_SPATIAL_RESAMPLING is enabled.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config()
+{
+    // Frame copy and border-extension hooks are always installed.
+    vp8_yv12_copy_frame_ptr           = vp8_yv12_copy_frame;
+    vp8_yv12_copy_frame_yonly_ptr     = vp8_yv12_copy_frame_yonly;
+    vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders;
+
+#if CONFIG_SPATIAL_RESAMPLING
+    // Up-scaling kernels, one horizontal/vertical/last-vertical triple per ratio.
+    vp8_horizontal_line_4_5_scale     = vp8cx_horizontal_line_4_5_scale_c;
+    vp8_vertical_band_4_5_scale       = vp8cx_vertical_band_4_5_scale_c;
+    vp8_last_vertical_band_4_5_scale  = vp8cx_last_vertical_band_4_5_scale_c;
+
+    vp8_horizontal_line_2_3_scale     = vp8cx_horizontal_line_2_3_scale_c;
+    vp8_vertical_band_2_3_scale       = vp8cx_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_2_3_scale  = vp8cx_last_vertical_band_2_3_scale_c;
+
+    vp8_horizontal_line_3_4_scale     = vp8cx_horizontal_line_3_4_scale_c;
+    vp8_vertical_band_3_4_scale       = vp8cx_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_4_scale  = vp8cx_last_vertical_band_3_4_scale_c;
+
+    vp8_horizontal_line_3_5_scale     = vp8cx_horizontal_line_3_5_scale_c;
+    vp8_vertical_band_3_5_scale       = vp8cx_vertical_band_3_5_scale_c;
+    vp8_last_vertical_band_3_5_scale  = vp8cx_last_vertical_band_3_5_scale_c;
+
+    vp8_horizontal_line_1_2_scale     = vp8cx_horizontal_line_1_2_scale_c;
+    vp8_vertical_band_1_2_scale       = vp8cx_vertical_band_1_2_scale_c;
+    vp8_last_vertical_band_1_2_scale  = vp8cx_last_vertical_band_1_2_scale_c;
+
+    // Down-scaling kernels.
+    vp8_horizontal_line_5_4_scale     = vp8cx_horizontal_line_5_4_scale_c;
+    vp8_vertical_band_5_4_scale       = vp8cx_vertical_band_5_4_scale_c;
+
+    vp8_horizontal_line_5_3_scale     = vp8cx_horizontal_line_5_3_scale_c;
+    vp8_vertical_band_5_3_scale       = vp8cx_vertical_band_5_3_scale_c;
+
+    vp8_horizontal_line_2_1_scale     = vp8cx_horizontal_line_2_1_scale_c;
+    vp8_vertical_band_2_1_scale       = vp8cx_vertical_band_2_1_scale_c;
+    vp8_vertical_band_2_1_scale_i     = vp8cx_vertical_band_2_1_scale_i_c;
+#endif
+
+}
diff --git a/vpx_scale/generic/vpxscale.c b/vpx_scale/generic/vpxscale.c
new file mode 100644
index 000000000..206cd5512
--- /dev/null
+++ b/vpx_scale/generic/vpxscale.c
@@ -0,0 +1,1088 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : vpxscale.c
+ *
+ * Description : Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_scale/scale_mode.h"
+
+/****************************************************************************
+* Exports
+*
+* Scaler entry points. Unless VPX_NO_GLOBALS is defined they are defined here
+* as function pointers initialised to 0; otherwise "vpxscale_nofp.h" is
+* included instead. Scale2D() and any_ratio_2d_scale() call through these
+* pointers without checking them for 0, so they presumably have to be
+* assigned (e.g. by vp8_scale_machine_specific_config()) before any scaling
+* is attempted - TODO confirm against the callers.
+*
+* Three signatures are used:
+*   (dest, dest_pitch, dest_width)                        - in-place band scalers
+*   (source, source_width, dest, dest_width)              - horizontal line scalers
+*   (source, src_pitch, dest, dest_pitch, dest_width)     - source-to-dest band scalers
+****************************************************************************/
+#ifndef VPX_NO_GLOBALS
+void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_3_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_2_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+
+void (*vp8_vertical_band_5_4_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_5_3_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_2_1_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+#else
+# include "vpxscale_nofp.h"
+#endif
+
+typedef struct
+{
+ int expanded_frame_width; // destination luma width; read as dw by any_ratio_frame_scale
+ int expanded_frame_height; // destination luma height; read as dh by any_ratio_frame_scale
+
+ int HScale; // horizontal divisor: source width = ceil(expanded_frame_width * HRatio / HScale)
+ int HRatio; // horizontal multiplier, see HScale
+ int VScale; // vertical divisor: source height = ceil(expanded_frame_height * VRatio / VScale)
+ int VRatio; // vertical multiplier, see VScale
+
+ YV12_BUFFER_CONFIG *src_yuv_config; // frame whose planes are read
+ YV12_BUFFER_CONFIG *dst_yuv_config; // frame whose planes are written
+
+} SCALE_VARS; // parameter bundle handed to any_ratio_frame_scale / any_ratio_2d_scale
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_copy
+ *
+ * INPUTS : const unsigned char *source : Pointer to the line to copy.
+ * unsigned int source_width : Number of bytes copied.
+ * unsigned char *dest : Pointer to the destination line.
+ * unsigned int dest_width : Width of destination line (UNUSED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : None
+ *
+ * FUNCTION : 1 to 1 "scaling" of a horizontal line of pixels: copies
+ * source_width bytes from source to dest.
+ *
+ * SPECIAL NOTES : Has the same signature as the vp8_horizontal_line_*_scale
+ * pointers so any_ratio_2d_scale can select it when no
+ * horizontal scaling is needed.
+ *
+ * ERRORS : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_copy(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width; // only present to match the line-scaler signature
+
+ duck_memcpy(dest, source, source_width);
+}
+/****************************************************************************
+ *
+ * ROUTINE : null_scale
+ *
+ * INPUTS : unsigned char *dest : Pointer to the band (UNUSED).
+ * unsigned int dest_pitch : Stride of the band (UNUSED).
+ * unsigned int dest_width : Width of the band (UNUSED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : None
+ *
+ * FUNCTION : 1 to 1 "scaling" of a vertical band: does nothing.
+ *
+ * SPECIAL NOTES : Has the same signature as the in-place vertical band
+ * scalers so any_ratio_2d_scale can select it when no
+ * vertical scaling is needed.
+ *
+ * ERRORS : None.
+ *
+ ****************************************************************************/
+static
+void null_scale(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ (void) dest;
+ (void) dest_pitch;
+ (void) dest_width;
+
+ return;
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_2t1_i
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on in source.
+ * unsigned int source_scale : Scale for source (UNUSED).
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on in destination.
+ * unsigned int dest_scale : Scale for destination (UNUSED).
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-to-1 interpolated scaling.
+ *
+ * SPECIAL NOTES : The first output sample is a plain copy of source[0].
+ * Every later output is a 3-tap weighted sum (3, 10, 3) / 16
+ * centred on every second source sample, so the last output
+ * reads one source_step beyond its centre sample.
+ * source_length is not consulted.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_i
+(
+ const unsigned char *source,
+ int source_step,
+ unsigned int source_scale,
+ unsigned int source_length,
+ unsigned char *dest,
+ int dest_step,
+ unsigned int dest_scale,
+ unsigned int dest_length
+)
+{
+ unsigned int i, j;
+ unsigned int temp;
+ int source_pitch = source_step; // distance to the neighbouring samples, before doubling
+ (void) source_length;
+ (void) source_scale;
+ (void) dest_scale;
+
+ source_step *= 2; // one output sample per two source samples
+ dest[0] = source[0]; // no left neighbour for the first sample: copy it
+
+ for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step)
+ {
+ temp = 8; // rounding term for the >> 4 below
+ temp += 3 * source[j-source_pitch];
+ temp += 10 * source[j];
+ temp += 3 * source[j+source_pitch];
+ temp >>= 4; // weights sum to 16, so the result is at most 255
+ dest[i] = (char)(temp);
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_2t1_ps
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on in source.
+ * unsigned int source_scale : Scale for source (UNUSED).
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on in destination.
+ * unsigned int dest_scale : Scale for destination (UNUSED).
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-to-1 point subsampled scaling.
+ *
+ * SPECIAL NOTES : Output sample k is a copy of source sample 2 * k; no
+ * filtering is applied.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_ps
+(
+ const unsigned char *source,
+ int source_step,
+ unsigned int source_scale,
+ unsigned int source_length,
+ unsigned char *dest,
+ int dest_step,
+ unsigned int dest_scale,
+ unsigned int dest_length
+)
+{
+ unsigned int i, j;
+
+ (void) source_length;
+ (void) source_scale;
+ (void) dest_scale;
+
+ source_step *= 2; // skip every other source sample
+ j = 0;
+
+ for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
+ dest[i] = source[j];
+}
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on in source.
+ * unsigned int source_scale : Scale for source.
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on in destination.
+ * unsigned int dest_scale : Scale for destination.
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs linear interpolation in one dimension.
+ *
+ * SPECIAL NOTES : Each output is a weighted average of two neighbouring
+ * source samples. The position advances by
+ * source_scale / dest_scale of a source sample per output.
+ * The sample after the current one is always read, and
+ * source_length is not consulted, so the caller must make
+ * sure that read stays inside the buffer.
+ *
+ ****************************************************************************/
+static
+void scale1d_c
+(
+ const unsigned char *source,
+ int source_step,
+ unsigned int source_scale,
+ unsigned int source_length,
+ unsigned char *dest,
+ int dest_step,
+ unsigned int dest_scale,
+ unsigned int dest_length
+)
+{
+ unsigned int i;
+ unsigned int round_value = dest_scale / 2; // rounding term for the division by dest_scale
+ unsigned int left_modifier = dest_scale; // weight of left_pixel; starts at full weight
+ unsigned int right_modifier = 0; // weight of right_pixel; left + right == dest_scale
+ unsigned char left_pixel = *source;
+ unsigned char right_pixel = *(source + source_step);
+
+ (void) source_length;
+
+ // These asserts are needed if there are boundary issues...
+ //assert ( dest_scale > source_scale );
+ //assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );
+
+ for (i = 0; i < dest_length * dest_step; i += dest_step)
+ {
+ dest[i] = (char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / dest_scale);
+
+ right_modifier += source_scale; // advance the sampling position
+
+ while (right_modifier > dest_scale) // moved past right_pixel: slide the pixel pair forward
+ {
+ right_modifier -= dest_scale;
+ source += source_step;
+ left_pixel = *source;
+ right_pixel = *(source + source_step);
+ }
+
+ left_modifier = dest_scale - right_modifier;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : Scale2D
+ *
+ * INPUTS : unsigned char *source : Pointer to data to be scaled.
+ * int source_pitch : Stride of source image.
+ * unsigned int source_width : Width of input image.
+ * unsigned int source_height : Height of input image.
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_pitch : Stride of destination image.
+ * unsigned int dest_width : Width of destination image.
+ * unsigned int dest_height : Height of destination image.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_area_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale divisor.
+ * unsigned int hratio : Horizontal scale multiplier.
+ * unsigned int vscale : Vertical scale divisor.
+ * unsigned int vratio : Vertical scale multiplier.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales one image plane from source into dest.
+ *
+ * SPECIAL NOTES : Scaling is performed one band at a time to help with
+ * caching.
+ * When hratio * 10 / hscale and vratio * 10 / vscale are
+ * both one of 8, 6 or 5, the vp8_*_5_4 / 5_3 / 2_1 function
+ * pointers are used. Otherwise the generic scale1d_*
+ * routines in this file are used.
+ * temp_area rows are addressed with dest_pitch.
+ * hscale and vscale are used as divisors and are not checked
+ * for zero.
+ *
+ ****************************************************************************/
+static
+void Scale2D
+(
+ //const
+ unsigned char *source,
+ int source_pitch,
+ unsigned int source_width,
+ unsigned int source_height,
+ unsigned char *dest,
+ int dest_pitch,
+ unsigned int dest_width,
+ unsigned int dest_height,
+ unsigned char *temp_area,
+ unsigned char temp_area_height,
+ unsigned int hscale,
+ unsigned int hratio,
+ unsigned int vscale,
+ unsigned int vratio,
+ unsigned int interlaced
+)
+{
+ //unsigned
+ int i, j, k;
+ int bands;
+ int dest_band_height; // only assigned on the paths that use it
+ int source_band_height; // only assigned on the paths that use it
+
+ typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
+ unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
+
+ Scale1D Scale1Dv = scale1d_c; // generic vertical scaler, may be replaced below
+ Scale1D Scale1Dh = scale1d_c; // generic horizontal scaler, may be replaced below
+
+ void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+ void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, unsigned int, unsigned int) = NULL;
+
+ int ratio_scalable = 1; // cleared when either ratio has no dedicated scaler
+ int interpolation = 0; // set for progressive 2-to-1 vertical scaling
+
+ unsigned char *source_base; // = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch)));
+ unsigned char *line_src;
+
+
+ source_base = (unsigned char *)source;
+
+ if (source_pitch < 0) // negative pitch: lowest address is the last row
+ {
+ int offset;
+
+ offset = (source_height - 1);
+ offset *= source_pitch;
+
+ source_base += offset;
+ }
+
+ // find out the ratio for each direction
+ switch (hratio * 10 / hscale)
+ {
+ case 8:
+ // 5-to-4 scale in width direction
+ horiz_line_scale = vp8_horizontal_line_5_4_scale;
+ break;
+ case 6:
+ // 5-to-3 scale in width direction
+ horiz_line_scale = vp8_horizontal_line_5_3_scale;
+ break;
+ case 5:
+ // 2-to-1 scale in width direction
+ horiz_line_scale = vp8_horizontal_line_2_1_scale;
+ break;
+ default:
+ // The ratio is not acceptable now
+ // throw("The ratio is not acceptable for now!");
+ ratio_scalable = 0;
+ break;
+ }
+
+ switch (vratio * 10 / vscale)
+ {
+ case 8:
+ // 5-to-4 scale in vertical direction
+ vert_band_scale = vp8_vertical_band_5_4_scale;
+ source_band_height = 5;
+ dest_band_height = 4;
+ break;
+ case 6:
+ // 5-to-3 scale in vertical direction
+ vert_band_scale = vp8_vertical_band_5_3_scale;
+ source_band_height = 5;
+ dest_band_height = 3;
+ break;
+ case 5:
+ // 2-to-1 scale in vertical direction
+
+ if (interlaced)
+ {
+ //if the content is interlaced, point sampling is used
+ vert_band_scale = vp8_vertical_band_2_1_scale;
+ }
+ else
+ {
+
+ interpolation = 1;
+ //if the content is progressive, interpolation is used
+ vert_band_scale = vp8_vertical_band_2_1_scale_i;
+
+ }
+
+ source_band_height = 2;
+ dest_band_height = 1;
+ break;
+ default:
+ // The ratio is not acceptable now
+ // throw("The ratio is not acceptable for now!");
+ ratio_scalable = 0;
+ break;
+ }
+
+ if (ratio_scalable)
+ {
+ if (source_height == dest_height)
+ {
+ // horizontal scaling only, one line at a time
+ for (k = 0; k < (int)dest_height; k++)
+ {
+ horiz_line_scale(source, source_width, dest, dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ if (interpolation)
+ {
+ if (source < source_base)
+ source = source_base;
+
+ horiz_line_scale(source, source_width, temp_area, dest_width); // prime temp_area row 0
+ }
+
+ for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++)
+ {
+ // scale one band horizontally
+ for (i = 0; i < source_band_height; i++)
+ {
+ // Trap case where we could read off the base of the source buffer
+
+ line_src = (unsigned char *)source + i * source_pitch;
+
+ if (line_src < source_base)
+ line_src = source_base;
+
+ horiz_line_scale(line_src, source_width,
+ temp_area + (i + 1)*dest_pitch, dest_width);
+ }
+
+ // Vertical scaling reads temp_area rows 1.. and writes into dest
+ vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
+
+ if (interpolation) // keep the band's last line as row 0 for the next band
+ vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+
+ // Next band...
+ source += (unsigned long) source_band_height * source_pitch;
+ dest += (unsigned long) dest_band_height * dest_pitch;
+ }
+
+ return;
+ }
+
+ // generic path: no dedicated scaler pair for these ratios
+ if (hscale == 2 && hratio == 1)
+ Scale1Dh = scale1d_2t1_ps;
+
+ if (vscale == 2 && vratio == 1)
+ {
+ if (interlaced)
+ Scale1Dv = scale1d_2t1_ps;
+ else
+ Scale1Dv = scale1d_2t1_i;
+ }
+
+ if (source_height == dest_height)
+ {
+ // horizontal scaling only, one line at a time
+ for (k = 0; k < (int)dest_height; k++)
+ {
+ Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ if (dest_height > source_height)
+ {
+ dest_band_height = temp_area_height - 1;
+ source_band_height = dest_band_height * source_height / dest_height;
+ }
+ else
+ {
+ source_band_height = temp_area_height - 1;
+ dest_band_height = source_band_height * vratio / vscale;
+ }
+
+ // first row needs to be done so that we can stay one row ahead for vertical zoom
+ Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
+
+ // for each band of the image
+ bands = (dest_height + dest_band_height - 1) / dest_band_height; // rounds up; dest_band_height must be non-zero
+
+ for (k = 0; k < bands; k++)
+ {
+ // scale one band horizontally
+ for (i = 1; i < source_band_height + 1; i++)
+ {
+ if (k * source_band_height + i < (int) source_height)
+ {
+ Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+ temp_area + i * dest_pitch, 1, hratio, dest_width);
+ }
+ else // Duplicate the last row
+ {
+ // past the end of the source: repeat the previous temp_area row
+ duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+ }
+ }
+
+ // scale one band vertically
+ for (j = 0; j < (int)dest_width; j++)
+ {
+ Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+ &dest[j], dest_pitch, vratio, dest_band_height);
+ }
+
+ // copy temp_area row 0 over from last row in the past
+ duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+ // move to the next band
+ source += source_band_height * source_pitch;
+ dest += dest_band_height * dest_pitch;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_frame
+ *
+ * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be scaled.
+ * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold scaled frame.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale divisor.
+ * unsigned int hratio : Horizontal scale multiplier.
+ * unsigned int vscale : Vertical scale divisor.
+ * unsigned int vratio : Vertical scale multiplier.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales the Y, U and V planes of src into dst with Scale2D.
+ * The scaled luma size is dw x dh, where
+ * dw = ceil(src->y_width * hratio / hscale) and
+ * dh = ceil(src->y_height * vratio / vscale); chroma planes are
+ * scaled to dw / 2 x dh / 2. Where the scaled picture is
+ * smaller than the destination plane, the remaining columns
+ * and rows are filled by replicating scaled pixels.
+ *
+ * SPECIAL NOTES : Scaling is performed one band at a time to help with
+ * caching.
+ * The fill code replicates from column dw - 2 (dw / 2 - 2
+ * for chroma) and row dh - 2 (dh / 2 - 2 for chroma), i.e.
+ * it also overwrites the last scaled column and row.
+ *
+ ****************************************************************************/
+void vp8_scale_frame
+(
+    YV12_BUFFER_CONFIG *src,
+    YV12_BUFFER_CONFIG *dst,
+    unsigned char *temp_area,
+    unsigned char temp_height,
+    unsigned int hscale,
+    unsigned int hratio,
+    unsigned int vscale,
+    unsigned int vratio,
+    unsigned int interlaced
+)
+{
+    int i;
+    // scaled luma dimensions, rounded up
+    int dw = (hscale - 1 + src->y_width * hratio) / hscale;
+    int dh = (vscale - 1 + src->y_height * vratio) / vscale;
+
+    // ---- Y plane ----
+    Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
+            (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+    // fill columns to the right of the scaled picture
+    if (dw < (int)dst->y_width)
+    {
+        for (i = 0; i < dh; i++)
+        {
+            duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1);
+        }
+    }
+
+    // fill rows below the scaled picture
+    if (dh < (int)dst->y_height)
+    {
+        for (i = dh - 1; i < (int)dst->y_height; i++)
+        {
+            duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+        }
+    }
+
+    // ---- U plane ----
+    Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
+            (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+    if (dw / 2 < (int)dst->uv_width)
+    {
+        for (i = 0; i < dst->uv_height; i++)
+        {
+            duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+        }
+    }
+
+    // The guard tests uv_height, so the fill must run to uv_height as well;
+    // stopping at y_height / 2 left the last chroma row untouched whenever
+    // the destination luma height is odd.
+    if (dh / 2 < (int)dst->uv_height)
+    {
+        for (i = dh / 2 - 1; i < (int)dst->uv_height; i++)
+        {
+            duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+        }
+    }
+
+    // ---- V plane ----
+    Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
+            (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+    if (dw / 2 < (int)dst->uv_width)
+    {
+        for (i = 0; i < dst->uv_height; i++)
+        {
+            duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+        }
+    }
+
+    // same bound as for the U plane
+    if (dh / 2 < (int) dst->uv_height)
+    {
+        for (i = dh / 2 - 1; i < (int)dst->uv_height; i++)
+        {
+            duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+        }
+    }
+}
+/****************************************************************************
+ *
+ * ROUTINE : any_ratio_2d_scale
+ *
+ * INPUTS : SCALE_VARS *si : Supplies HScale, HRatio, VScale and VRatio.
+ * const unsigned char *source : Pointer to source image.
+ * int source_pitch : Stride of source image.
+ * unsigned int source_width : Width of source image.
+ * unsigned int source_height : Height of source image (only used to find the
+ * lowest source address when source_pitch < 0).
+ * unsigned char *dest : Pointer to destination image.
+ * unsigned int dest_pitch : Stride of destination image.
+ * unsigned int dest_width : Width of destination image.
+ * unsigned int dest_height : Height of destination image.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ * FUNCTION : Scale the image with changing aspect ratio.
+ *
+ * SPECIAL NOTES : Only the ratios with a case in the two switch statements
+ * below are supported; any other ratio returns 0 without
+ * writing to dest.
+ * Source lines are scaled horizontally straight into dest,
+ * then each band is scaled vertically in place.
+ * HScale and VScale are used as divisors and are not checked
+ * for zero.
+ *
+ ****************************************************************************/
+static
+int any_ratio_2d_scale
+(
+ SCALE_VARS *si,
+ const unsigned char *source,
+ int source_pitch,
+ unsigned int source_width,
+ unsigned int source_height,
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width,
+ unsigned int dest_height
+)
+{
+ unsigned int i, k;
+ unsigned int src_band_height = 0;
+ unsigned int dest_band_height = 0;
+
+ // suggested scale factors
+ int hs = si->HScale;
+ int hr = si->HRatio;
+ int vs = si->VScale;
+ int vr = si->VRatio;
+
+ // assume the ratios are scalable instead of should be centered
+ int ratio_scalable = 1;
+
+ const unsigned char *source_base = ((source_pitch >= 0) ? source : (source + ((source_height - 1) * source_pitch)));
+ const unsigned char *line_src;
+
+ void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+ void (*vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
+ void (*last_vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
+
+ (void) si; // redundant: si is read above
+
+ // find out the ratio for each direction
+ switch (hr * 30 / hs) // integer division: 3/4 maps to 22
+ {
+ case 24:
+ // 4-5 Scale in Width direction
+ horiz_line_scale = vp8_horizontal_line_4_5_scale;
+ break;
+ case 22:
+ // 3-4 Scale in Width direction
+ horiz_line_scale = vp8_horizontal_line_3_4_scale;
+ break;
+
+ case 20:
+ // 2-3 Scale in Width direction
+ horiz_line_scale = vp8_horizontal_line_2_3_scale;
+ break;
+ case 18:
+ // 3-5 Scale in Width direction
+ horiz_line_scale = vp8_horizontal_line_3_5_scale;
+ break;
+ case 15:
+ // 1-2 Scale in Width direction
+ horiz_line_scale = vp8_horizontal_line_1_2_scale;
+ break;
+ case 30:
+ // no scale in Width direction
+ horiz_line_scale = horizontal_line_copy;
+ break;
+ default:
+ // The ratio is not acceptable now
+ // throw("The ratio is not acceptable for now!");
+ ratio_scalable = 0;
+ break;
+ }
+
+ switch (vr * 30 / vs)
+ {
+ case 24:
+ // 4-5 Scale in vertical direction
+ vert_band_scale = vp8_vertical_band_4_5_scale;
+ last_vert_band_scale = vp8_last_vertical_band_4_5_scale;
+ src_band_height = 4;
+ dest_band_height = 5;
+ break;
+ case 22:
+ // 3-4 Scale in vertical direction
+ vert_band_scale = vp8_vertical_band_3_4_scale;
+ last_vert_band_scale = vp8_last_vertical_band_3_4_scale;
+ src_band_height = 3;
+ dest_band_height = 4;
+ break;
+ case 20:
+ // 2-3 Scale in vertical direction
+ vert_band_scale = vp8_vertical_band_2_3_scale;
+ last_vert_band_scale = vp8_last_vertical_band_2_3_scale;
+ src_band_height = 2;
+ dest_band_height = 3;
+ break;
+ case 18:
+ // 3-5 Scale in vertical direction
+ vert_band_scale = vp8_vertical_band_3_5_scale;
+ last_vert_band_scale = vp8_last_vertical_band_3_5_scale;
+ src_band_height = 3;
+ dest_band_height = 5;
+ break;
+ case 15:
+ // 1-2 Scale in vertical direction
+ vert_band_scale = vp8_vertical_band_1_2_scale;
+ last_vert_band_scale = vp8_last_vertical_band_1_2_scale;
+ src_band_height = 1;
+ dest_band_height = 2;
+ break;
+ case 30:
+ // no scale in vertical direction
+ vert_band_scale = null_scale;
+ last_vert_band_scale = null_scale;
+ src_band_height = 4;
+ dest_band_height = 4;
+ break;
+ default:
+ // The ratio is not acceptable now
+ // throw("The ratio is not acceptable for now!");
+ ratio_scalable = 0;
+ break;
+ }
+
+ if (ratio_scalable == 0)
+ return ratio_scalable;
+
+ horiz_line_scale(source, source_width, dest, dest_width); // first line of the first band
+
+ // except last band
+ for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++)
+ {
+ // scale one band horizontally
+ for (i = 1; i < src_band_height; i++)
+ {
+ // Trap case where we could read off the base of the source buffer
+ line_src = source + i * source_pitch;
+
+ if (line_src < source_base)
+ line_src = source_base;
+
+ horiz_line_scale(line_src, source_width,
+ dest + i * dest_pitch, dest_width);
+ }
+
+ // first line of next band
+ // Trap case where we could read off the base of the source buffer
+ line_src = source + src_band_height * source_pitch;
+
+ if (line_src < source_base)
+ line_src = source_base;
+
+ horiz_line_scale(line_src, source_width,
+ dest + dest_band_height * dest_pitch,
+ dest_width);
+
+ // Vertical scaling is in place
+ vert_band_scale(dest, dest_pitch, dest_width);
+
+ // Next band...
+ source += src_band_height * source_pitch;
+ dest += dest_band_height * dest_pitch;
+ }
+
+ // last band: there is no following band to take a first line from
+ for (i = 1; i < src_band_height; i++)
+ {
+ // Trap case where we could read off the base of the source buffer
+ line_src = source + i * source_pitch;
+
+ if (line_src < source_base)
+ line_src = source_base;
+
+ horiz_line_scale(line_src, source_width,
+ dest + i * dest_pitch,
+ dest_width);
+ }
+
+ // Vertical scaling is in place
+ last_vert_band_scale(dest, dest_pitch, dest_width);
+
+ return ratio_scalable;
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : any_ratio_frame_scale
+ *
+ * INPUTS : SCALE_VARS *scale_vars : Scale factors, expanded frame size and the
+ * source / destination frame configurations.
+ * int YOffset : Offset from start of the destination Y buffer to
+ * the first Y sample written.
+ * int UVOffset : Offset from start of the destination U and V
+ * buffers to the first chroma sample written.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ * FUNCTION : Scale the image with changing aspect ratio.
+ *
+ * SPECIAL NOTES : The Y plane is scaled first and the area between the
+ * scaled picture and the rounded-up size (ew x eh) is zeroed.
+ * The U and V planes are only scaled when the Y plane could
+ * be scaled; their individual results are not checked.
+ * Source chroma rows are stepped with the source frame's
+ * uv_stride.
+ *
+ ****************************************************************************/
+static
+int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset)
+{
+    int i;
+    int ew;
+    int eh;
+
+    // suggested scale factors
+    int hs = scale_vars->HScale;
+    int hr = scale_vars->HRatio;
+    int vs = scale_vars->VScale;
+    int vr = scale_vars->VRatio;
+
+    int ratio_scalable = 1;
+
+    // source size implied by the destination size and the ratios, rounded up
+    int sw = (scale_vars->expanded_frame_width * hr + hs - 1) / hs;
+    int sh = (scale_vars->expanded_frame_height * vr + vs - 1) / vs;
+    int dw = scale_vars->expanded_frame_width;
+    int dh = scale_vars->expanded_frame_height;
+    YV12_BUFFER_CONFIG *src_yuv_config = scale_vars->src_yuv_config;
+    YV12_BUFFER_CONFIG *dst_yuv_config = scale_vars->dst_yuv_config;
+
+    // destination size after rounding the source size up to a multiple of
+    // 3 (ratios with numerator 3) or 8 (all others)
+    if (hr == 3)
+        ew = (sw + 2) / 3 * 3 * hs / hr;
+    else
+        ew = (sw + 7) / 8 * 8 * hs / hr;
+
+    if (vr == 3)
+        eh = (sh + 2) / 3 * 3 * vs / vr;
+    else
+        eh = (sh + 7) / 8 * 8 * vs / vr;
+
+    ratio_scalable = any_ratio_2d_scale(scale_vars,
+                                        (const unsigned char *)src_yuv_config->y_buffer,
+                                        src_yuv_config->y_stride, sw, sh,
+                                        (unsigned char *) dst_yuv_config->y_buffer + YOffset,
+                                        dst_yuv_config->y_stride, dw, dh);
+
+    // zero the columns between the scaled width and the rounded-up width
+    for (i = 0; i < eh; i++)
+    {
+        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride + dw, 0, ew - dw);
+    }
+
+    // zero the rows between the scaled height and the rounded-up height
+    for (i = dh; i < eh; i++)
+    {
+        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride, 0, ew);
+    }
+
+    if (ratio_scalable == 0)
+        return ratio_scalable;
+
+    // chroma planes are half size, rounded up
+    sw = (sw + 1) >> 1;
+    sh = (sh + 1) >> 1;
+    dw = (dw + 1) >> 1;
+    dh = (dh + 1) >> 1;
+
+    // Use the source's own chroma stride: y_stride / 2 only equals
+    // uv_stride when the luma stride is exactly twice the chroma stride.
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src_yuv_config->u_buffer,
+                       src_yuv_config->uv_stride, sw, sh,
+                       (unsigned char *)dst_yuv_config->u_buffer + UVOffset,
+                       dst_yuv_config->uv_stride, dw, dh);
+
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src_yuv_config->v_buffer,
+                       src_yuv_config->uv_stride, sw, sh,
+                       (unsigned char *)dst_yuv_config->v_buffer + UVOffset,
+                       dst_yuv_config->uv_stride, dw, dh);
+
+    return ratio_scalable;
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : center_image
+ *
+ * INPUTS : YV12_BUFFER_CONFIG *src_yuv_config : Frame to copy from.
+ * YV12_BUFFER_CONFIG *dst_yuv_config : Frame to copy into.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies the Y, U and V planes of the source frame, unscaled,
+ * into the middle of the destination frame.
+ *
+ * SPECIAL NOTES : The luma offsets are half the size difference between the
+ * two frames; the chroma offsets are half the luma offsets.
+ * Destination pixels outside the copied area are not written.
+ *
+ ****************************************************************************/
+static void
+center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config)
+{
+    // rows above / columns left of the centered picture
+    int y_row_pad = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2;
+    int y_col_pad = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2;
+    int uv_row_pad = y_row_pad / 2;
+    int uv_col_pad = y_col_pad / 2;
+
+    unsigned char *from;
+    unsigned char *to;
+    int rows_left;
+
+    // luma plane
+    from = src_yuv_config->y_buffer;
+    to = (unsigned char *)dst_yuv_config->y_buffer + (y_row_pad * dst_yuv_config->y_stride) + y_col_pad;
+
+    for (rows_left = src_yuv_config->y_height; rows_left > 0; rows_left--)
+    {
+        duck_memcpy(to, from, src_yuv_config->y_width);
+        to += dst_yuv_config->y_stride;
+        from += src_yuv_config->y_stride;
+    }
+
+    // first chroma plane (U)
+    from = src_yuv_config->u_buffer;
+    to = (unsigned char *)dst_yuv_config->u_buffer + (uv_row_pad * dst_yuv_config->uv_stride) + uv_col_pad;
+
+    for (rows_left = src_yuv_config->uv_height; rows_left > 0; rows_left--)
+    {
+        duck_memcpy(to, from, src_yuv_config->uv_width);
+        to += dst_yuv_config->uv_stride;
+        from += src_yuv_config->uv_stride;
+    }
+
+    // second chroma plane (V)
+    from = src_yuv_config->v_buffer;
+    to = (unsigned char *)dst_yuv_config->v_buffer + (uv_row_pad * dst_yuv_config->uv_stride) + uv_col_pad;
+
+    for (rows_left = src_yuv_config->uv_height; rows_left > 0; rows_left--)
+    {
+        duck_memcpy(to, from, src_yuv_config->uv_width);
+        to += dst_yuv_config->uv_stride;
+        from += src_yuv_config->uv_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_yv12_scale_or_center
+ *
+ * INPUTS : YV12_BUFFER_CONFIG *src_yuv_config : Frame to read.
+ * YV12_BUFFER_CONFIG *dst_yuv_config : Frame to write.
+ * int expanded_frame_width : Width of the scaled picture.
+ * int expanded_frame_height : Height of the scaled picture.
+ * int scaling_mode : SCALE_TO_FIT, MAINTAIN_ASPECT_RATIO or CENTER.
+ * int HScale, HRatio : Horizontal scale factors.
+ * int VScale, VRatio : Vertical scale factors.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Decides to scale or center image in scale buffer for blit
+ *
+ * SPECIAL NOTES : SCALE_TO_FIT and MAINTAIN_ASPECT_RATIO are handled the same
+ * way: the picture is scaled and placed in the middle of the
+ * destination. Any other scaling_mode value does nothing.
+ * The result of any_ratio_frame_scale is not checked.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_scale_or_center
+(
+    YV12_BUFFER_CONFIG *src_yuv_config,
+    YV12_BUFFER_CONFIG *dst_yuv_config,
+    int expanded_frame_width,
+    int expanded_frame_height,
+    int scaling_mode,
+    int HScale,
+    int HRatio,
+    int VScale,
+    int VRatio
+)
+{
+    if (scaling_mode == SCALE_TO_FIT || scaling_mode == MAINTAIN_ASPECT_RATIO)
+    {
+        SCALE_VARS scale_vars;
+
+        // top-left corner of the scaled picture inside the destination;
+        // offsets are measured in destination strides, not widths
+        int row = (dst_yuv_config->y_height - expanded_frame_height) / 2;
+        int col = (dst_yuv_config->y_width - expanded_frame_width) / 2;
+        int YOffset = row * dst_yuv_config->y_stride + col;
+        int UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1);
+
+        scale_vars.expanded_frame_width = expanded_frame_width;
+        scale_vars.expanded_frame_height = expanded_frame_height;
+        scale_vars.HScale = HScale;
+        scale_vars.HRatio = HRatio;
+        scale_vars.VScale = VScale;
+        scale_vars.VRatio = VRatio;
+        scale_vars.src_yuv_config = src_yuv_config;
+        scale_vars.dst_yuv_config = dst_yuv_config;
+
+        // perform center and scale
+        any_ratio_frame_scale(&scale_vars, YOffset, UVOffset);
+    }
+    else if (scaling_mode == CENTER)
+    {
+        center_image(src_yuv_config, dst_yuv_config);
+    }
+}
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
new file mode 100644
index 000000000..04617be51
--- /dev/null
+++ b/vpx_scale/generic/yv12config.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+
+/****************************************************************************
+ * vp8_yv12_de_alloc_frame_buffer
+ *
+ * Releases the allocation recorded in ybf->buffer_alloc, if there is one,
+ * and clears that field.  No other member of *ybf is modified.
+ *
+ * Returns 0 on success, or -1 when ybf itself is a null pointer.
+ ****************************************************************************/
+int
+vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    if (!ybf)
+        return -1;
+
+    if (ybf->buffer_alloc)
+        duck_free(ybf->buffer_alloc);
+
+    /* Mark the descriptor as owning no memory. */
+    ybf->buffer_alloc = 0;
+
+    return 0;
+}
+
+/****************************************************************************
+ * vp8_yv12_alloc_frame_buffer
+ *
+ * (Re)allocates the storage behind *ybf for a frame of width x height
+ * visible luma pixels and fills in the plane geometry and plane pointers.
+ * Any allocation already recorded in ybf->buffer_alloc is released first.
+ *
+ * Layout inside the single allocation, in order:
+ *   Y plane : (height + 2*border) rows of (width + 2*border) bytes,
+ *             padded up to a multiple of 16 bytes
+ *   U plane : ((height+1)/2 + border) rows of ((width+1)/2 + border) bytes
+ *   V plane : same size as U
+ * y_buffer / u_buffer / v_buffer point at the first visible pixel of each
+ * plane, i.e. past `border` (luma) or `border / 2` (chroma) rows and columns.
+ *
+ * frame_size covers the whole layout including the padding after the Y
+ * plane, so frame_size bytes starting at buffer_alloc span all three planes.
+ *
+ * Returns  0 on success,
+ *         -1 if the allocation fails (the geometry fields have already been
+ *            updated and buffer_alloc is NULL),
+ *         -2 if ybf is a null pointer.
+ ****************************************************************************/
+int
+vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border)
+{
+    int yplane_size = (height + 2 * border) * (width + 2 * border);
+    int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border);
+    int uv_origin;
+
+    if (!ybf)
+        return -2;
+
+    vp8_yv12_de_alloc_frame_buffer(ybf);
+
+    // The chroma planes start on a 16 byte boundary after the luma plane.
+    // Round up before frame_size is derived so that frame_size accounts for
+    // the padding the plane pointers below rely on.
+    if (yplane_size & 0xf)
+        yplane_size += 16 - (yplane_size & 0xf);
+
+    ybf->y_width = width;
+    ybf->y_height = height;
+    ybf->y_stride = width + 2 * border;
+
+    ybf->uv_width = (1 + width) / 2;
+    ybf->uv_height = (1 + height) / 2;
+    ybf->uv_stride = ybf->uv_width + border;
+
+    ybf->border = border;
+    ybf->frame_size = yplane_size + 2 * uvplane_size;
+
+    // Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
+    // when we have a large motion vector in V on the last v block.
+    // Note : We never use these pixels anyway so this doesn't hurt.
+    ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0);
+
+    if (ybf->buffer_alloc == NULL)
+        return -1;
+
+    // Offset of the first visible chroma pixel inside a chroma plane.
+    uv_origin = (border / 2 * ybf->uv_stride) + border / 2;
+
+    ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size + uv_origin;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + uv_origin;
+
+    return 0;
+}
+
+/****************************************************************************
+ * vp8_yv12_black_frame_buffer
+ *
+ * Paints the frame black: y_stride * y_height bytes starting at y_buffer
+ * are set to 0x00, and uv_stride * uv_height bytes starting at u_buffer and
+ * at v_buffer are set to 0x80.  Nothing is written when no buffer has been
+ * allocated.
+ *
+ * Returns 0 when ybf is non-null (even if it owns no buffer), -1 otherwise.
+ ****************************************************************************/
+int
+vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    if (!ybf)
+        return -1;
+
+    if (!ybf->buffer_alloc)
+        return 0;
+
+    duck_memset(ybf->y_buffer, 0x0, ybf->y_stride * ybf->y_height);
+    duck_memset(ybf->u_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
+    duck_memset(ybf->v_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
+
+    return 0;
+}
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
new file mode 100644
index 000000000..4906625c8
--- /dev/null
+++ b/vpx_scale/generic/yv12extend.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+
+/****************************************************************************
+ * extend_plane
+ *
+ * Replicates the edge pixels of one plane into the border surrounding it.
+ *
+ *   plane  - first visible pixel of the plane
+ *   stride - bytes between the starts of two consecutive rows
+ *   width  - visible pixels per row
+ *   height - visible rows
+ *   border - border thickness: bytes to the left/right, rows above/below
+ *
+ * Pass 1 fills `border` bytes on the left and on the right of every visible
+ * row with that row's first and last pixel respectively.
+ * Pass 2 copies `stride` bytes of the first and last rows, starting at their
+ * left border and therefore including what pass 1 wrote, into each of the
+ * `border` rows above and below the plane.
+ ****************************************************************************/
+static void
+extend_plane(unsigned char *plane, int stride, int width, int height, int border)
+{
+    int i;
+    unsigned char *first_px = plane;
+    unsigned char *last_px = plane + width - 1;
+    unsigned char *top_row;
+    unsigned char *bottom_row;
+    unsigned char *above;
+    unsigned char *below;
+
+    // copy the left and right most columns out
+    for (i = 0; i < height; i++)
+    {
+        vpx_memset(first_px - border, first_px[0], border);
+        vpx_memset(last_px + 1, last_px[0], border);
+        first_px += stride;
+        last_px += stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    top_row = plane - border;
+    bottom_row = top_row + (height * stride) - stride;
+    above = top_row - (border * stride);
+    below = bottom_row + stride;
+
+    for (i = 0; i < border; i++)
+    {
+        vpx_memcpy(above, top_row, stride);
+        vpx_memcpy(below, bottom_row, stride);
+        above += stride;
+        below += stride;
+    }
+}
+
+/****************************************************************************
+ * vp8_yv12_extend_frame_borders
+ *
+ * Extends the visible area of the Y, U and V planes of *ybf into their
+ * borders.  The luma plane uses ybf->border; both chroma planes use half of
+ * that value.
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    int uv_border = ybf->border / 2;
+
+    extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_width, ybf->y_height, ybf->border);
+    extend_plane(ybf->u_buffer, ybf->uv_stride, ybf->uv_width, ybf->uv_height, uv_border);
+    extend_plane(ybf->v_buffer, ybf->uv_stride, ybf->uv_width, ybf->uv_height, uv_border);
+}
+
+
+/****************************************************************************
+ * vp8_yv12_extend_frame_borders_yonly
+ *
+ * Same as vp8_yv12_extend_frame_borders() but touches the Y plane only; the
+ * chroma planes and their borders are left as they are.
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
+{
+    extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_width, ybf->y_height, ybf->border);
+}
+
+
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_yv12_copy_frame
+ *
+ * INPUTS : src_ybc : frame to read from.
+ *          dst_ybc : frame to write to.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies the visible Y, U and V rows of the source frame into
+ *            the destination frame, then refreshes the destination's
+ *            borders through vp8_yv12_extend_frame_borders_ptr.
+ *
+ * SPECIAL NOTES : Row count and row length are taken from the source only,
+ *                 so the destination must be at least as large. Each
+ *                 frame is walked with its own stride.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned char *from;
+    unsigned char *to;
+    int rows_left;
+
+    /* Y plane */
+    from = src_ybc->y_buffer;
+    to = dst_ybc->y_buffer;
+
+    for (rows_left = src_ybc->y_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->y_width);
+        from += src_ybc->y_stride;
+        to += dst_ybc->y_stride;
+    }
+
+    /* U plane */
+    from = src_ybc->u_buffer;
+    to = dst_ybc->u_buffer;
+
+    for (rows_left = src_ybc->uv_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->uv_width);
+        from += src_ybc->uv_stride;
+        to += dst_ybc->uv_stride;
+    }
+
+    /* V plane */
+    from = src_ybc->v_buffer;
+    to = dst_ybc->v_buffer;
+
+    for (rows_left = src_ybc->uv_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->uv_width);
+        from += src_ybc->uv_stride;
+        to += dst_ybc->uv_stride;
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(dst_ybc);
+}
+
+/* Copies only the visible Y rows of src_ybc into dst_ybc (row count and row
+ * length come from the source, each frame advances by its own stride) and
+ * then extends the destination's luma borders. */
+void
+vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned char *from = src_ybc->y_buffer;
+    unsigned char *to = dst_ybc->y_buffer;
+    int rows_left;
+
+    for (rows_left = src_ybc->y_height; rows_left > 0; rows_left--)
+    {
+        vpx_memcpy(to, from, src_ybc->y_width);
+        from += src_ybc->y_stride;
+        to += dst_ybc->y_stride;
+    }
+
+    vp8_yv12_extend_frame_borders_yonly(dst_ybc);
+}
diff --git a/vpx_scale/include/arm/vpxscale_nofp.h b/vpx_scale/include/arm/vpxscale_nofp.h
new file mode 100644
index 000000000..d6181d207
--- /dev/null
+++ b/vpx_scale/include/arm/vpxscale_nofp.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+/*
+ * Declarations for the fixed-ratio scaling primitives, followed by #defines
+ * that bind each generic vp8_* name to one concrete implementation.
+ * NOTE(review): this header has no include guard. Re-including it only
+ * repeats identical prototypes and macro definitions, but a guard would be
+ * more conventional.
+ *
+ * First group: `_c` routines. The vertical_band / last_vertical_band entry
+ * points receive only a destination pointer, pitch and width; the
+ * horizontal_line entry points receive a source row and a destination row.
+ */
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* Second group: `_c` routines for the 5_4, 5_3 and 2_1 variants. Here the
+ * vertical_band entry points take separate source and destination buffers,
+ * each with its own pitch. */
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+/* Third group: `_armv4` routines. Each has the same parameter list as the
+ * `_c` routine of the same ratio in the first group. */
+void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* Name binding. The vertical_band and horizontal_line scalers for 4_5, 2_3,
+ * 3_5, 3_4 and 1_2 resolve to the `_armv4` routines; every last_vertical_band
+ * variant and all 5_4 / 5_3 / 2_1 scalers resolve to the `_c` routines. */
+#define vp8_vertical_band_4_5_scale vertical_band_4_5_scale_armv4
+#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
+#define vp8_vertical_band_2_3_scale vertical_band_2_3_scale_armv4
+#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
+#define vp8_vertical_band_3_5_scale vertical_band_3_5_scale_armv4
+#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
+#define vp8_vertical_band_3_4_scale vertical_band_3_4_scale_armv4
+#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c
+#define vp8_horizontal_line_1_2_scale horizontal_line_1_2_scale_armv4
+#define vp8_horizontal_line_3_5_scale horizontal_line_3_5_scale_armv4
+#define vp8_horizontal_line_3_4_scale horizontal_line_3_4_scale_armv4
+#define vp8_horizontal_line_4_5_scale horizontal_line_4_5_scale_armv4
+#define vp8_horizontal_line_2_3_scale horizontal_line_2_3_scale_armv4
+#define vp8_vertical_band_1_2_scale vertical_band_1_2_scale_armv4
+#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
+#define vp8_vertical_band_5_4_scale vp8cx_vertical_band_5_4_scale_c
+#define vp8_vertical_band_5_3_scale vp8cx_vertical_band_5_3_scale_c
+#define vp8_vertical_band_2_1_scale vp8cx_vertical_band_2_1_scale_c
+#define vp8_vertical_band_2_1_scale_i vp8cx_vertical_band_2_1_scale_i_c
+#define vp8_horizontal_line_2_1_scale vp8cx_horizontal_line_2_1_scale_c
+#define vp8_horizontal_line_5_3_scale vp8cx_horizontal_line_5_3_scale_c
+#define vp8_horizontal_line_5_4_scale vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/generic/vpxscale_arbitrary.h b/vpx_scale/include/generic/vpxscale_arbitrary.h
new file mode 100644
index 000000000..2b50f24cf
--- /dev/null
+++ b/vpx_scale/include/generic/vpxscale_arbitrary.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __VPX_SCALE_ARBITRARY_H__
+#define __VPX_SCALE_ARBITRARY_H__
+
+#include "vpx_scale/yv12config.h"
+
+/* Working state of the bicubic scaler: the input and output geometry plus
+ * the lookup tables, coefficient tables and scratch buffers it uses.
+ * The tables and buffers are held by pointer; this header does not show
+ * who allocates or frees them. */
+typedef struct
+{
+    int in_width;
+    int in_height;
+
+    int out_width;
+    int out_height;
+    int max_usable_out_width;
+
+    // numerator for the width and height
+    int nw;
+    int nh;
+    int nh_uv;
+
+    // output to input correspondence array
+    short *l_w;
+    short *l_h;
+    short *l_h_uv;
+
+    // polyphase coefficients
+    short *c_w;
+    short *c_h;
+    short *c_h_uv;
+
+    // buffer for horizontal filtering.
+    unsigned char *hbuf;
+    unsigned char *hbuf_uv;
+} BICUBIC_SCALER_STRUCT;
+
+/* Parameterless functions are declared with (void) so the compiler has a
+ * real prototype and rejects calls that pass arguments. */
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height);
+int bicubic_scale(int in_width, int in_height, int in_stride,
+                  int out_width, int out_height, int out_stride,
+                  unsigned char *input_image, unsigned char *output_image);
+void bicubic_scale_frame_reset(void);
+void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                         int new_width, int new_height);
+void bicubic_coefficient_init(void);
+void bicubic_coefficient_destroy(void);
+
+#endif /* __VPX_SCALE_ARBITRARY_H__ */
diff --git a/vpx_scale/include/generic/vpxscale_depricated.h b/vpx_scale/include/generic/vpxscale_depricated.h
new file mode 100644
index 000000000..015eed0fc
--- /dev/null
+++ b/vpx_scale/include/generic/vpxscale_depricated.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : vpxscale_depricated.h
+*
+* Description : Scaling routines exposed as global function pointers
+*
+****************************************************************************/
+#ifndef VPXSCALE_H
+#define VPXSCALE_H
+/* Each scaling entry point below is an externally defined function pointer;
+ * callers invoke the routine through the pointer.
+ * NOTE(review): the VPXSCALE_H guard is also used by
+ * include/leapster/vpxscale.h, so only the first of the two headers included
+ * in a translation unit takes effect. */
+extern void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* NOTE(review): presumably points the function pointers above at an
+ * implementation chosen from the mmx/xmm/wmt flags. The definition is not
+ * part of this header; confirm against it. */
+extern void dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled);
+
+#endif
diff --git a/vpx_scale/include/generic/vpxscale_nofp.h b/vpx_scale/include/generic/vpxscale_nofp.h
new file mode 100644
index 000000000..c4d5f4c6f
--- /dev/null
+++ b/vpx_scale/include/generic/vpxscale_nofp.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+/*
+ * Declarations for the fixed-ratio scaling primitives, followed by #defines
+ * that bind each generic vp8_* name to its `_c` implementation.
+ * NOTE(review): this header has no include guard. Re-including it only
+ * repeats identical prototypes and macro definitions, but a guard would be
+ * more conventional.
+ *
+ * First group: the vertical_band / last_vertical_band entry points receive
+ * only a destination pointer, pitch and width; the horizontal_line entry
+ * points receive a source row and a destination row.
+ */
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* Second group: the 5_4, 5_3 and 2_1 variants. Here the vertical_band entry
+ * points take separate source and destination buffers, each with its own
+ * pitch. */
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+/* Name binding: every vp8_* scaler name resolves to the `_c` routine of the
+ * same ratio declared above. */
+#define vp8_vertical_band_4_5_scale vp8cx_vertical_band_4_5_scale_c
+#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
+#define vp8_vertical_band_2_3_scale vp8cx_vertical_band_2_3_scale_c
+#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
+#define vp8_vertical_band_3_5_scale vp8cx_vertical_band_3_5_scale_c
+#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
+#define vp8_horizontal_line_1_2_scale vp8cx_horizontal_line_1_2_scale_c
+#define vp8_horizontal_line_3_5_scale vp8cx_horizontal_line_3_5_scale_c
+#define vp8_horizontal_line_4_5_scale vp8cx_horizontal_line_4_5_scale_c
+#define vp8_horizontal_line_2_3_scale vp8cx_horizontal_line_2_3_scale_c
+#define vp8_vertical_band_1_2_scale vp8cx_vertical_band_1_2_scale_c
+#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
+#define vp8_vertical_band_5_4_scale vp8cx_vertical_band_5_4_scale_c
+#define vp8_vertical_band_5_3_scale vp8cx_vertical_band_5_3_scale_c
+#define vp8_vertical_band_2_1_scale vp8cx_vertical_band_2_1_scale_c
+#define vp8_vertical_band_2_1_scale_i vp8cx_vertical_band_2_1_scale_i_c
+#define vp8_horizontal_line_2_1_scale vp8cx_horizontal_line_2_1_scale_c
+#define vp8_horizontal_line_5_3_scale vp8cx_horizontal_line_5_3_scale_c
+#define vp8_horizontal_line_5_4_scale vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/leapster/vpxscale.h b/vpx_scale/include/leapster/vpxscale.h
new file mode 100644
index 000000000..f70029cae
--- /dev/null
+++ b/vpx_scale/include/leapster/vpxscale.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : vpxscale.h (leapster)
+*
+* Description : Scaling routines exposed through one table of function
+*               pointers
+*
+****************************************************************************/
+#ifndef VPXSCALE_H
+#define VPXSCALE_H
+/* NOTE(review): the VPXSCALE_H guard is also used by
+ * include/generic/vpxscale_depricated.h, so only the first of the two
+ * headers included in a translation unit takes effect. */
+/* Function-pointer types, one per scaling entry point. The vertical_band
+ * types take a destination pointer, pitch and width; the horizontal_line
+ * types take a source row and a destination row. */
+// fwg 2004-10-14
+typedef void (*vpxvertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxlast_vertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxvertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxlast_vertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxhorizontal_line_1_2_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+typedef void (*vpxhorizontal_line_3_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+typedef void (*vpxhorizontal_line_4_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+typedef void (*vpxvertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxlast_vertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+
+/* Table with one member of each pointer type declared above. */
+typedef struct vpxglobal_scalling_ptrs_t
+{
+ vpxvertical_band_4_5_scale_lf vpxvertical_band_4_5_scale_t;
+ vpxlast_vertical_band_4_5_scale_lf vpxlast_vertical_band_4_5_scale_t;
+ vpxvertical_band_3_5_scale_lf vpxvertical_band_3_5_scale_t;
+ vpxlast_vertical_band_3_5_scale_lf vpxlast_vertical_band_3_5_scale_t;
+ vpxhorizontal_line_1_2_scale_lf vpxhorizontal_line_1_2_scale_t;
+ vpxhorizontal_line_3_5_scale_lf vpxhorizontal_line_3_5_scale_t;
+ vpxhorizontal_line_4_5_scale_lf vpxhorizontal_line_4_5_scale_t;
+ vpxvertical_band_1_2_scale_lf vpxvertical_band_1_2_scale_t;
+ vpxlast_vertical_band_1_2_scale_lf vpxlast_vertical_band_1_2_scale_t;
+} vpxglobal_scalling_ptrs;
+/* Pointer to the table in use; the object itself is defined elsewhere. */
+extern struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs;
+
+/*
+Disabled declarations: the same nine entry points as individual global
+function pointers instead of members of the table above.
+
+extern void (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+extern void (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+*/
+
+#endif
diff --git a/vpx_scale/include/symbian/vpxscale_nofp.h b/vpx_scale/include/symbian/vpxscale_nofp.h
new file mode 100644
index 000000000..d6181d207
--- /dev/null
+++ b/vpx_scale/include/symbian/vpxscale_nofp.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+/* Scaler bindings for Symbian: prototypes for the C (vp8cx_*_c) and ARMv4 (*_armv4) routines, followed by the vp8_* name mappings. */
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* C routines for the inverse ratios (5_4, 5_3, 2_1); the vertical ones take separate source and dest buffers with their own pitches. */
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+/* ARMv4 routines: only the horizontal line and the non-"last" vertical band scalers are declared. */
+void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* vp8_* names: bound to the ARMv4 routine where one is declared above, otherwise to the C version. */
+#define vp8_vertical_band_4_5_scale vertical_band_4_5_scale_armv4
+#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
+#define vp8_vertical_band_2_3_scale vertical_band_2_3_scale_armv4
+#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
+#define vp8_vertical_band_3_5_scale vertical_band_3_5_scale_armv4
+#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
+#define vp8_vertical_band_3_4_scale vertical_band_3_4_scale_armv4
+#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c
+#define vp8_horizontal_line_1_2_scale horizontal_line_1_2_scale_armv4
+#define vp8_horizontal_line_3_5_scale horizontal_line_3_5_scale_armv4
+#define vp8_horizontal_line_3_4_scale horizontal_line_3_4_scale_armv4
+#define vp8_horizontal_line_4_5_scale horizontal_line_4_5_scale_armv4
+#define vp8_horizontal_line_2_3_scale horizontal_line_2_3_scale_armv4
+#define vp8_vertical_band_1_2_scale vertical_band_1_2_scale_armv4
+#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
+#define vp8_vertical_band_5_4_scale vp8cx_vertical_band_5_4_scale_c
+#define vp8_vertical_band_5_3_scale vp8cx_vertical_band_5_3_scale_c
+#define vp8_vertical_band_2_1_scale vp8cx_vertical_band_2_1_scale_c
+#define vp8_vertical_band_2_1_scale_i vp8cx_vertical_band_2_1_scale_i_c
+#define vp8_horizontal_line_2_1_scale vp8cx_horizontal_line_2_1_scale_c
+#define vp8_horizontal_line_5_3_scale vp8cx_horizontal_line_5_3_scale_c
+#define vp8_horizontal_line_5_4_scale vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/vpxscale_nofp.h b/vpx_scale/include/vpxscale_nofp.h
new file mode 100644
index 000000000..f6482f944
--- /dev/null
+++ b/vpx_scale/include/vpxscale_nofp.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if defined(__S60_V20__) || defined(__SYMBIAN32__) && !defined(__WINS__)
+#include "symbian\vpxscale_nofp.h"
+#else
+#include "generic\vpxscale_nofp.h"
+#endif
diff --git a/vpx_scale/intel_linux/scaleopt.c b/vpx_scale/intel_linux/scaleopt.c
new file mode 100644
index 000000000..6555600e9
--- /dev/null
+++ b/vpx_scale/intel_linux/scaleopt.c
@@ -0,0 +1,1852 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scaleopt.c
+*
+* Description : Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+/****************************************************************************
+* Module Statics -- compiled out by the "#if 0" below; the routines in this file declare the tables they need as locals instead
+****************************************************************************/
+#if 0 /* disabled: file-scope copies of the fixed-point weight tables */
+__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
+__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
+__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };
+#endif /* 0 */
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_3_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : First pixel of the source line.
+ * unsigned int source_width : Source pixel count; only used to compute where the main loop stops.
+ * unsigned char *dest : Start of the destination line.
+ * unsigned int dest_width : Unused.
+ *
+ * OUTPUTS : Five bytes are stored at dest for every three bytes consumed from source.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : The main loop is tested at the bottom, so one loop group plus
+ * the tail group (six source pixels) are always processed.
+ * Every group, including the tail, loads a full DWORD from
+ * source; the tail discards the fourth byte it reads.
+ * No EMMS is issued here. NOTE(review): confirm the caller
+ * restores the FPU state after the MMX code.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ __declspec(align(16)) unsigned short const35_2[] = { 154, 51, 205, 102 }; // weights (x/256) applied to the right-hand neighbour: ~3/5, 1/5, 4/5, 2/5
+ __declspec(align(16)) unsigned short const35_1[] = { 102, 205, 51, 154 }; // weights (x/256) applied to the left-hand neighbour: ~2/5, 4/5, 1/5, 3/5
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; // rounding term added before the >> 8
+
+ (void) dest_width;
+
+ __asm
+ {
+
+ push ebx
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ lea edx, [esi+ecx-3]; // edx = source + source_width - 3, the loop limit
+
+ movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
+ movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_3_5_loop:
+
+ mov eax, DWORD PTR [esi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // right-hand neighbours * const35_2
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // left-hand neighbours * const35_1
+
+ mov [edi], ebx // write output 00 xx xx xx (the three xx bytes are overwritten by the movd below)
+ add esi, 3
+
+ add edi, 5
+ paddw mm0, mm1 // sum of the two weighted neighbours
+
+ paddw mm0, mm4 // add rounding value
+ psrlw mm0, 8
+
+ cmp esi, edx
+ packuswb mm0, mm7
+
+ movd DWORD Ptr [edi-4], mm0 // outputs 1-4 of this group (edi was already advanced by 5)
+ jl horiz_line_3_5_loop // signed compare: repeat while esi < edx
+
+//Exit: tail group -- pixel 02 is replicated in place of the missing right-hand neighbour
+ mov eax, DWORD PTR [esi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl eax, 8 // eax = xx 01 02 02
+ and eax, 0xffff0000 // eax = xx xx 02 02
+
+ or eax, ebx // eax = 01 02 02 02
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // right-hand neighbours * const35_2
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // left-hand neighbours * const35_1
+
+ mov [edi], ebx // write output 00 xx xx xx (the three xx bytes are overwritten by the movd below)
+ paddw mm0, mm1 // sum of the two weighted neighbours
+
+ paddw mm0, mm4 // add rounding value
+ psrlw mm0, 8
+
+ packuswb mm0, mm7
+ movd DWORD Ptr [edi+1], mm0 // outputs 1-4 of the tail group; output 4 equals pixel 02
+
+ pop ebx
+
+ }
+
+ /* Plain-C form of the routine, kept as a comment.
+ const unsigned char *src = source;
+ unsigned char *des = dest;
+ unsigned int a, b, c ;
+ unsigned int i;
+ (void) dest_width;
+
+ for ( i=0; i<source_width-3; i+=3 )
+ {
+ a = src[0];
+ b = src[1];
+ des [0] = (UINT8) (a);
+ // 2 * left + 3 * right /5
+ des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
+ c = src[2] ;
+ // 4 * left + 1 * right /5
+ des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
+ // 1 * left + 4 * right /5
+ des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
+
+ a = src[3];
+ // 3 * left + 2 * right /5
+ des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);
+
+ src += 3;
+ des += 5;
+ }
+
+ a = src[0];
+ b = src[1];
+ des [0] = (UINT8) (a);
+ // 2 * left + 3 * right /5
+ des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
+ c = src[2] ;
+ // 4 * left + 1 * right /5
+ des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
+ // 1 * left + 4 * right /5
+ des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
+
+ des [4] = (UINT8) (c);
+ */
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_4_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : First pixel of the source line.
+ * unsigned int source_width : Source pixel count; only used to compute where the main loop stops.
+ * unsigned char *dest : Start of the destination line.
+ * unsigned int dest_width : Unused.
+ *
+ * OUTPUTS : Ten bytes are stored at dest for every eight bytes consumed from source.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : Each pass handles eight source pixels (two 4-to-5 groups).
+ * The main loop is tested at the bottom, so one loop pass plus
+ * the tail pass (sixteen source pixels) are always processed.
+ * Loop passes read nine source bytes (esi .. esi+8); the tail
+ * reads eight and replicates pixel 07.
+ * No EMMS is issued here. NOTE(review): confirm the caller
+ * restores the FPU state after the MMX code.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; // rounding term added before the >> 8
+ __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102, 51 }; // weights (x/256) applied to the right-hand neighbour
+ __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 }; // weights (x/256) applied to the left-hand neighbour
+ __declspec(align(16)) unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; // selects byte 6; used by the tail to replicate pixel 07
+
+ (void)dest_width;
+
+ __asm
+ {
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ lea edx, [esi+ecx-8]; // edx = source + source_width - 8, the loop limit
+
+ movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
+ movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_4_5_loop:
+
+ movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
+
+ movd DWORD PTR [edi], mm0 // write output 00 xx xx xx (bytes 1-3 are overwritten below)
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [edi+5], mm2 // write output 5 (= source 04); the next three bytes are overwritten below
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
+
+ paddw mm0, mm1 // sum of the two weighted neighbours
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7
+
+ movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
+ add edi, 10
+
+ add esi, 8
+ paddw mm2, mm3 // sum of the two weighted neighbours
+
+ paddw mm2, mm4 // added round values
+ cmp esi, edx
+
+ psrlw mm2, 8
+ packuswb mm2, mm7
+
+ movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09 (edi was already advanced by 10)
+ jl horiz_line_4_5_loop // signed compare: repeat while esi < edx
+
+//Exit: tail pass -- pixel 07 is replicated in place of the missing pixel 08
+ movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
+
+ movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
+ pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
+
+ psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
+ por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
+
+ movq mm3, mm1
+
+ movd DWORD PTR [edi], mm0 // write output 00 xx xx xx (bytes 1-3 are overwritten below)
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [edi+5], mm2 // write output 5 (= source 04); the next three bytes are overwritten below
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
+
+ paddw mm0, mm1 // sum of the two weighted neighbours
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
+
+ movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
+ paddw mm2, mm3 // sum of the two weighted neighbours
+
+ paddw mm2, mm4 // added round values
+ psrlw mm2, 8
+
+ packuswb mm2, mm7
+ movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09; output 09 equals pixel 07
+
+
+ }
+ /* Plain-C form of the routine (one 4-pixel group per pass), kept as a comment.
+ const unsigned char *src = source;
+ unsigned char *des = dest;
+ unsigned int a, b, c ;
+ unsigned i;
+ (void) dest_width;
+
+ for ( i=0; i<source_width-4; i+=4 )
+ {
+ a = src[0];
+ b = src[1];
+ des [0] = (UINT8) a;
+ des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
+ c = src[2] * 154;
+ a = src[3];
+ des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
+ des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
+ b = src[4];
+ des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);
+
+ src += 4;
+ des += 5;
+ }
+
+ a = src[0];
+ b = src[1];
+ des [0] = (UINT8) (a);
+ des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
+ c = src[2] * 154;
+ a = src[3];
+ des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
+ des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
+ des [4] = (UINT8) (a);
+ */
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Row 0 of the band; the band is scaled in place.
+ * unsigned int dest_pitch : Byte offset from one row to the next.
+ * unsigned int dest_width : Bytes per row to process; handled 8 at a time while the remaining count is > 0.
+ *
+ * OUTPUTS : Rows 1, 2, 3 and 4 of the band are overwritten; row 0 is left as is.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ *
+ * SPECIAL NOTES : Reads rows 0-3 and row 5 (the first line of the band below
+ * the current band). The function also has a "C" only
+ * version. No EMMS is issued here.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+
+ __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256, approximately 1/5
+ __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256, approximately 2/5
+ __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256, approximately 3/5
+ __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256, approximately 4/5
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; // rounding term added before the >> 8
+
+ __asm
+ {
+
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_4_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant 4/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1] (original values, unpacked low/high)
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+ movq mm1, [edi] // mm1=Src[3];
+
+ // mm0, mm2 --- Src[2] (original values, unpacked low/high)
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [edi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3] (original values, unpacked low/high)
+ // mm7 -- cleared for unpacking
+
+ movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group (row 5)
+
+ movq mm5, four_fifths // mm5 = 4/5
+ pmullw mm1, mm5 // d * 4/5
+
+ movq mm6, one_fifth // mm6 = 1/5
+ movq mm2, mm0 // make a copy
+
+ pmullw mm3, mm5 // d * 4/5
+ punpcklbw mm0, mm7 // unpack low
+
+ pmullw mm0, mm6 // an * 1/5
+ punpckhbw mm2, mm7 // unpack high
+
+ paddw mm1, mm0 // d * 4/5 + an * 1/5
+ pmullw mm2, mm6 // an * 1/5
+
+ paddw mm3, mm2 // d * 4/5 + an * 1/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[4]
+
+ movq QWORD ptr [edi+ecx], mm1 // write des[4]
+
+ add edi, 8
+ add esi, 8
+
+ sub edx, 8
+ jg vs_4_5_loop // next 8 columns while the remaining width is > 0
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Row 0 of the band; the band is scaled in place.
+ * unsigned int dest_pitch : Byte offset from one row to the next.
+ * unsigned int dest_width : Bytes per row to process; handled 8 at a time while the remaining count is > 0.
+ *
+ * OUTPUTS : Rows 1, 2, 3 and 4 of the band are overwritten; row 0 is left as is.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, no row of a band below
+ * is read: row 4 is a copy of the original row 3. The function
+ * also has a "C" only version. No EMMS is issued here.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256, approximately 1/5
+ __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256, approximately 2/5
+ __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256, approximately 3/5
+ __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256, approximately 4/5
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; // rounding term added before the >> 8
+
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ last_vs_4_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant 4/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1] (original values, unpacked low/high)
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+ movq mm1, [edi] // mm1=Src[3];
+
+ movq QWORD ptr [edi+ecx], mm1 // write des[4]: plain copy of Src[3]
+
+ // mm0, mm2 --- Src[2] (original values, unpacked low/high)
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [edi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3] (no longer needed)
+ // mm7 -- still zero for the next pass
+ add edi, 8
+ add esi, 8
+
+ sub edx, 8
+ jg last_vs_4_5_loop // next 8 columns while the remaining width is > 0
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Row 0 of the band; the band is scaled in place.
+ * unsigned int dest_pitch : Byte offset from one row to the next.
+ * unsigned int dest_width : Bytes per row to process; handled 8 at a time while the remaining count is > 0.
+ *
+ * OUTPUTS : Rows 1, 2, 3 and 4 of the band are overwritten; row 0 is left as is.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ * SPECIAL NOTES : Reads rows 0-2 and row 5 (the first line of the band below
+ * the current band). The function also has a "C" only
+ * version. No EMMS is issued here.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256, approximately 1/5
+ __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256, approximately 2/5
+ __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256, approximately 3/5
+ __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256, approximately 4/5
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; // rounding term added before the >> 8
+
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_3_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1] (original values, unpacked low/high)
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 is scratch from here until it is cleared again below)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [edi], mm4 // write des[3]
+
+ // mm0, mm2 --- Src[2] (c, unpacked low/high)
+
+ pxor mm7, mm7 // clear mm7 for unpacking (also needed by the next loop pass)
+ movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group (row 5)
+
+ movq mm5, three_fifths // mm5 = 3/5
+ pmullw mm0, mm5 // c * 3/5
+
+ movq mm6, two_fifths // mm6 = 2/5
+ movq mm3, mm1 // make a copy
+
+ pmullw mm2, mm5 // c * 3/5
+ punpcklbw mm1, mm7 // unpack low
+
+ pmullw mm1, mm6 // an * 2/5
+ punpckhbw mm3, mm7 // unpack high
+
+ paddw mm0, mm1 // c * 3/5 + an * 2/5
+ pmullw mm3, mm6 // an * 2/5
+
+ paddw mm2, mm3 // c * 3/5 + an * 2/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[4]
+
+ movq QWORD ptr [edi+ecx], mm0 // write des[4]
+
+ add edi, 8
+ add esi, 8
+
+ sub edx, 8
+ jg vs_3_5_loop // next 8 columns while the remaining width is > 0
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Row 0 of the band; the band is scaled in place.
+ * unsigned int dest_pitch : Byte offset from one row to the next.
+ * unsigned int dest_width : Bytes per row to process; handled 8 at a time while the remaining count is > 0.
+ *
+ * OUTPUTS : Rows 1, 2, 3 and 4 of the band are overwritten; row 0 is left as is.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
+ *
+ * SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, no row of a band below
+ * is read: row 4 is a copy of the original row 2. The function
+ * also has a "C" only version. No EMMS is issued here.
+ * mm7 is used as scratch inside the loop and must be zero at
+ * the top of every pass, so it is cleared again before looping.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256, approximately 1/5
+ __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256, approximately 2/5
+ __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256, approximately 3/5
+ __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256, approximately 4/5
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; // rounding term added before the >> 8
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+
+ last_vs_3_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+
+
+ // mm1, mm3 --- Src[1] (original values, unpacked low/high)
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq QWORD ptr [edi+ecx], mm0 // write des[4]: plain copy of Src[2]
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 is scratch from here on)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [edi], mm4 // write des[3]
+
+ // mm0, mm2 --- Src[2] (no longer needed)
+
+ pxor mm7, mm7 // re-clear mm7: it held c * 4/5 and the unpacks at the loop top need zero
+ add edi, 8
+ add esi, 8
+
+ sub edx, 8
+ jg last_vs_3_5_loop // next 8 columns while the remaining width is > 0
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_1_2_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : First source row of the band; the
+ *                                interpolated row is written one pitch
+ *                                below it.
+ * unsigned int dest_pitch : Byte distance between consecutive rows.
+ * unsigned int dest_width : Number of columns; consumed 8 per pass.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of a band of pixels. The row at
+ *            dest + dest_pitch is filled with (a + b + 1) >> 1, where a is
+ *            the row at dest and b is the row at dest + 2 * dest_pitch.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band (the row at dest + 2 * dest_pitch). Each pass loads
+ * and stores whole quadwords, so rows are touched in multiples of 8
+ * bytes. A C only version also exists.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1}; // rounding term, one per word lane
+
+ __asm
+ {
+
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ pxor mm7, mm7 // clear out mm7 (zero source for the byte-to-word unpacks)
+ mov edx, dest_width // Loop counter
+
+ vs_1_2_loop:
+
+ movq mm0, [esi] // get Src[0]
+ movq mm1, [esi + ecx * 2] // get Src[1], two rows down; the row in between is the output
+
+ movq mm2, mm0 // make copy before unpack
+ movq mm3, mm1 // make copy before unpack
+
+ punpcklbw mm0, mm7 // low Src[0]
+ movq mm6, four_ones // mm6= 1, 1, 1, 1
+
+ punpcklbw mm1, mm7 // low Src[1]
+ paddw mm0, mm1 // low (a + b)
+
+ punpckhbw mm2, mm7 // high Src[0]
+ paddw mm0, mm6 // low (a + b + 1)
+
+ punpckhbw mm3, mm7 // high Src[1]
+ paddw mm2, mm3 // high (a + b )
+
+ psraw mm0, 1 // low (a + b +1 )/2
+ paddw mm2, mm6 // high (a + b + 1)
+
+ psraw mm2, 1 // high (a + b + 1)/2
+ packuswb mm0, mm2 // pack results
+
+ movq [esi+ecx], mm0 // write out eight bytes to the row between the two sources
+ add esi, 8 // next eight columns
+
+ sub edx, 8 // eight columns done
+ jg vs_1_2_loop // repeat while columns remain
+ }
+
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_1_2_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Source row; it is duplicated into the
+ *                                row one pitch below.
+ * unsigned int dest_pitch : Byte distance between consecutive rows.
+ * unsigned int dest_width : Number of columns; consumed 8 per pass.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of the last band of pixels. The row at
+ *            dest is copied unchanged to dest + dest_pitch.
+ *
+ * SPECIAL NOTES : No row below the band is read, so the new row is a
+ * straight copy rather than an average. Each pass moves a whole
+ * quadword, so rows are touched in multiples of 8 bytes.
+ * A C only version also exists.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ mov edx, dest_width // Loop counter
+
+ last_vs_1_2_loop:
+
+ movq mm0, [esi] // get Src[0]
+ movq [esi+ecx], mm0 // write out eight bytes, one row down
+
+ add esi, 8 // next eight columns
+ sub edx, 8 // eight columns done
+
+ jg last_vs_1_2_loop // repeat while columns remain
+ }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_1_2_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Source line.
+ *                  unsigned int source_width   : Number of source pixels;
+ *                                                consumed 8 per pass.
+ *                  unsigned char *dest         : Destination line; receives
+ *                                                16 bytes per 8 source
+ *                                                pixels.
+ *                  unsigned int dest_width     : NOT USED.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels. Each
+ *                  source pixel s[i] produces the pair
+ *                      s[i], (s[i] + s[i + 1] + 1) >> 1
+ *                  and the final group of eight uses its own last pixel as
+ *                  the missing right-hand neighbour.
+ *
+ *  SPECIAL NOTES : The main loop reads one byte past each group of eight
+ *                  (movq from source + 1); the final group is handled
+ *                  separately so no byte beyond it is read. A line of eight
+ *                  pixels or fewer goes straight to the final-group code;
+ *                  entering the main loop first would scale a second group
+ *                  lying past the end of the line. The final-group code
+ *                  always handles a full group of eight.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};
+
+    (void) dest_width;
+
+    __asm
+    {
+        mov         esi,    source
+        mov         edi,    dest
+
+        pxor        mm7,    mm7                     // zero for byte-to-word unpacking
+        movq        mm6,    four_ones               // rounding term
+
+        mov         ecx,    source_width            // pixels still to scale
+
+        cmp         ecx,    8                       // at most one group left?
+        jle         hs_1_2_last                     // then only the final group runs
+
+        hs_1_2_loop:
+
+        movq        mm0,    [esi]                   // s[0..7]
+        movq        mm1,    [esi+1]                 // s[1..8], the right-hand neighbours
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        movq        mm4,    mm0                     // keep the original bytes for interleaving
+        punpcklbw   mm0,    mm7
+
+        punpcklbw   mm1,    mm7
+        paddw       mm0,    mm1                     // low  (a + b)
+
+        paddw       mm0,    mm6                     // low  (a + b + 1)
+        punpckhbw   mm2,    mm7
+
+        punpckhbw   mm3,    mm7
+        paddw       mm2,    mm3                     // high (a + b)
+
+        paddw       mm2,    mm6                     // high (a + b + 1)
+        psraw       mm0,    1
+
+        psraw       mm2,    1
+        packuswb    mm0,    mm2                     // eight averaged bytes
+
+        movq        mm2,    mm4
+        punpcklbw   mm2,    mm0                     // s0 avg0 s1 avg1 s2 avg2 s3 avg3
+
+        movq        [edi],  mm2
+        punpckhbw   mm4,    mm0                     // s4 avg4 s5 avg5 s6 avg6 s7 avg7
+
+        movq        [edi+8], mm4
+        add         esi,    8
+
+        add         edi,    16
+        sub         ecx,    8
+
+        cmp         ecx,    8
+        jg          hs_1_2_loop
+
+        hs_1_2_last:
+
+        // last eight pixels
+
+        movq        mm0,    [esi]
+        movq        mm1,    mm0
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        psrlq       mm1,    8                       // s1 .. s7, 0
+        psrlq       mm3,    56                      // s7 moved to the lowest byte
+
+        psllq       mm3,    56                      // s7 moved back to the highest byte
+        por         mm1,    mm3                     // s1 .. s7, s7 : neighbours with the edge repeated
+
+        movq        mm3,    mm1
+        movq        mm4,    mm0
+
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+
+        paddw       mm0,    mm1
+        paddw       mm0,    mm6
+
+        punpckhbw   mm2,    mm7
+        punpckhbw   mm3,    mm7
+
+        paddw       mm2,    mm3
+        paddw       mm2,    mm6
+
+        psraw       mm0,    1
+        psraw       mm2,    1
+
+        packuswb    mm0,    mm2
+        movq        mm2,    mm4
+
+        punpcklbw   mm2,    mm0
+        movq        [edi],  mm2
+
+        punpckhbw   mm4,    mm0
+        movq        [edi+8], mm4
+    }
+}
+
+
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_5_4_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels;
+ *                                                the loop ends once the read
+ *                                                pointer reaches
+ *                                                source + source_width.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : NOT USED.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies a horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 4. For every five source
+ *                  pixels a, b, c, d, e the four outputs are
+ *                      des[0] = a
+ *                      des[1] = (b * 192 + c *  64 + 128) >> 8
+ *                      des[2] = (c * 128 + d * 128 + 128) >> 8
+ *                      des[3] = (d *  64 + e * 192 + 128) >> 8
+ *
+ *  SPECIAL NOTES : Each pass loads a whole quadword, i.e. three bytes more
+ *                  than the five it consumes, and always runs at least once.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+
+    __declspec(align(16)) const unsigned short const54_2[] = {  0,  64, 128, 192 };
+    __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128,  64 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        movq        mm5,    const54_1               // weights for pixels 0..3
+
+        pxor        mm7,    mm7                     // zero for unpacking and packing
+        movq        mm6,    const54_2               // weights for pixels 1..4
+
+        movq        mm4,    round_values
+        lea         edx,    [esi+ecx]               // end of the source line
+
+        horizontal_line_5_4_loop:
+
+        movq        mm0,    QWORD PTR [esi]         // 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0                     // 00 01 02 03 04 05 06 07
+
+        psrlq       mm0,    8                       // 01 02 03 04 05 06 07 xx
+        punpcklbw   mm1,    mm7                     // xx 00 xx 01 xx 02 xx 03
+
+        punpcklbw   mm0,    mm7                     // xx 01 xx 02 xx 03 xx 04
+        pmullw      mm1,    mm5                     // pixels 0..3 times 256, 192, 128, 64
+
+        pmullw      mm0,    mm6                     // pixels 1..4 times 0, 64, 128, 192
+        add         esi,    5
+
+        add         edi,    4
+        paddw       mm1,    mm0
+
+        paddw       mm1,    mm4                     // + 128
+        psrlw       mm1,    8
+
+        cmp         esi,    edx                     // flags survive the MMX instructions below
+        packuswb    mm1,    mm7
+
+        movd        DWORD PTR [edi-4], mm1          // four output pixels
+
+        jl          horizontal_line_5_4_loop
+
+    }
+
+}
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_5_4_scale_mmx
+ *
+ * FUNCTION : Scales a band of five source rows down to four destination
+ *            rows, four columns per pass. With a, b, c, d, e the source
+ *            rows at source + 0 .. 4 * src_pitch:
+ *                dest row 0 = a
+ *                dest row 1 = (b * 192 + c *  64 + 128) >> 8
+ *                dest row 2 = (c * 128 + d * 128 + 128) >> 8
+ *                dest row 3 = (d *  64 + e * 192 + 128) >> 8
+ *
+ * SPECIAL NOTES : dest_width is the loop counter and drops by 4 per pass.
+ * ebx is saved and restored around the loop; eax is used as scratch.
+ *
+ ****************************************************************************/
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+ __declspec(align(16)) const unsigned short one_fourths[] = { 64, 64, 64, 64 };
+ __declspec(align(16)) const unsigned short two_fourths[] = { 128, 128, 128, 128 };
+ __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 };
+
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+ __asm
+ {
+ push ebx
+
+ mov esi, source // Get the source pointer
+ mov ecx, src_pitch // Get the source pitch
+
+ mov edi, dest // Get the destination pointer
+ pxor mm7, mm7 // clear out mm7
+
+ mov edx, dest_pitch // Get the destination pitch
+ mov ebx, dest_width // Loop counter
+
+ vs_5_4_loop:
+
+ movd mm0, DWORD ptr [esi] // src[0];
+ movd mm1, DWORD ptr [esi+ecx] // src[1];
+
+ movd mm2, DWORD ptr [esi+ecx*2] // src[2]
+ lea eax, [esi+ecx*2] // eax = address of src[2]
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ movq mm3, mm2 // second copy of src[2]
+ pmullw mm1, three_fourths // src[1] * 192
+
+ pmullw mm2, one_fourths // src[2] * 64
+ movd mm4, [eax+ecx] // src[3]
+
+ pmullw mm3, two_fourths // src[2] * 128
+ punpcklbw mm4, mm7
+
+ movq mm5, mm4 // second copy of src[3]
+ pmullw mm4, two_fourths // src[3] * 128
+
+ paddw mm1, mm2 // src[1] * 192 + src[2] * 64
+ movd mm6, [eax+ecx*2] // src[4]
+
+ pmullw mm5, one_fourths // src[3] * 64
+ paddw mm1, round_values;
+
+ paddw mm3, mm4 // src[2] * 128 + src[3] * 128
+ psrlw mm1, 8
+
+ punpcklbw mm6, mm7
+ paddw mm3, round_values
+
+ pmullw mm6, three_fourths // src[4] * 192
+ psrlw mm3, 8
+
+ packuswb mm1, mm7
+ packuswb mm3, mm7
+
+ movd DWORD PTR [edi], mm0 // des[0] is src[0] unchanged
+ movd DWORD PTR [edi+edx], mm1 // des[1]
+
+
+ paddw mm5, mm6 // src[3] * 64 + src[4] * 192
+ movd DWORD PTR [edi+edx*2], mm3 // des[2]
+
+ lea eax, [edi+edx*2] // eax = address of des[2], taken before edi advances
+ paddw mm5, round_values
+
+ psrlw mm5, 8
+ add edi, 4
+
+ packuswb mm5, mm7
+ movd DWORD PTR [eax+edx], mm5 // des[3]
+
+ add esi, 4
+ sub ebx, 4
+
+ jg vs_5_4_loop
+
+ pop ebx
+ }
+}
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_5_3_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels;
+ *                                                consumed 5 per pass.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : NOT USED.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies a horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 3. For every five source
+ *                  pixels a, b, c, d, e the three outputs are
+ *                      des[0] = a
+ *                      des[1] = (b *  85 + c * 171 + 128) >> 8
+ *                      des[2] = (d * 171 + e *  85 + 128) >> 8
+ *
+ *  SPECIAL NOTES : The main loop stores four bytes per pass; the fourth is
+ *                  overwritten by the following pass. The last group is
+ *                  handled separately so that exactly three bytes are
+ *                  stored for it. A line holding a single group goes
+ *                  straight to that last-group code; entering the main loop
+ *                  first would scale a second group lying past the end of
+ *                  the line. Every group is loaded as a whole quadword.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_3_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    __declspec(align(16)) const unsigned short const53_1[] = {  0,  85, 171, 0 };
+    __declspec(align(16)) const unsigned short const53_2[] = {256, 171,  85, 0 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        movq        mm5,    const53_1               // weights for the odd pixels
+
+        pxor        mm7,    mm7
+        movq        mm6,    const53_2               // weights for the even pixels
+
+        movq        mm4,    round_values
+        lea         edx,    [esi+ecx-5]             // start of the last group
+
+        cmp         esi,    edx                     // only one group in the line?
+        jge         horizontal_line_5_3_last        // then only the last-group code runs
+
+        horizontal_line_5_3_loop:
+
+        movq        mm0,    QWORD PTR [esi]         // bytes 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0                     // bytes 00 01 02 03 04 05 06 07
+
+        psllw       mm0,    8                       // even bytes moved to the high half of each word
+        psrlw       mm1,    8                       // words: 01 03 05 07
+
+        psrlw       mm0,    8                       // words: 00 02 04 06
+        psllq       mm1,    16                      // words: -- 01 03 05
+
+        pmullw      mm0,    mm6                     // 00*256  02*171  04*85  0
+
+        pmullw      mm1,    mm5                     // 0  01*85  03*171  0
+        add         esi,    5
+
+        add         edi,    3
+        paddw       mm1,    mm0
+
+        paddw       mm1,    mm4                     // + 128
+        psrlw       mm1,    8
+
+        cmp         esi,    edx                     // flags survive the MMX instructions below
+        packuswb    mm1,    mm7
+
+        movd        DWORD PTR [edi-3], mm1          // three pixels plus one spare byte
+        jl          horizontal_line_5_3_loop
+
+        horizontal_line_5_3_last:
+
+        // exit condition: last group, stores exactly three bytes
+        movq        mm0,    QWORD PTR [esi]         // bytes 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0                     // bytes 00 01 02 03 04 05 06 07
+
+        psllw       mm0,    8                       // even bytes moved to the high half of each word
+        psrlw       mm1,    8                       // words: 01 03 05 07
+
+        psrlw       mm0,    8                       // words: 00 02 04 06
+        psllq       mm1,    16                      // words: -- 01 03 05
+
+        pmullw      mm0,    mm6
+
+        pmullw      mm1,    mm5
+        paddw       mm1,    mm0
+
+        paddw       mm1,    mm4
+        psrlw       mm1,    8
+
+        packuswb    mm1,    mm7
+        movd        eax,    mm1
+
+        mov         edx,    eax
+        shr         edx,    16                      // third output pixel into dl
+
+        mov         WORD PTR[edi], ax
+        mov         BYTE PTR[edi+2], dl
+
+    }
+
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_5_3_scale_mmx
+ *
+ * FUNCTION : Scales a band of five source rows down to three destination
+ *            rows, four columns per pass. With a, b, c, d, e the source
+ *            rows at source + 0 .. 4 * src_pitch:
+ *                dest row 0 = a
+ *                dest row 1 = (b *  85 + c * 171 + 128) >> 8
+ *                dest row 2 = (d * 171 + e *  85 + 128) >> 8
+ *
+ * SPECIAL NOTES : dest_width is the loop counter and drops by 4 per pass.
+ * ebx is saved and restored around the loop; eax is used as scratch.
+ *
+ ****************************************************************************/
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+ __declspec(align(16)) const unsigned short one_thirds[] = { 85, 85, 85, 85 };
+ __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 };
+
+ __asm
+ {
+ push ebx
+
+ mov esi, source // Get the source pointer
+ mov ecx, src_pitch // Get the source pitch
+
+ mov edi, dest // Get the destination pointer
+ pxor mm7, mm7 // clear out mm7
+
+ mov edx, dest_pitch // Get the destination pitch
+ movq mm5, one_thirds
+
+ movq mm6, two_thirds
+ mov ebx, dest_width; // Loop counter
+
+ vs_5_3_loop:
+
+ movd mm0, DWORD ptr [esi] // src[0];
+ movd mm1, DWORD ptr [esi+ecx] // src[1];
+
+ movd mm2, DWORD ptr [esi+ecx*2] // src[2]
+ lea eax, [esi+ecx*2] // eax = address of src[2]
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ pmullw mm1, mm5 // src[1] * 85
+ pmullw mm2, mm6 // src[2] * 171
+
+ movd mm3, DWORD ptr [eax+ecx] // src[3]
+ movd mm4, DWORD ptr [eax+ecx*2] // src[4]
+
+ punpcklbw mm3, mm7
+ punpcklbw mm4, mm7
+
+ pmullw mm3, mm6 // src[3] * 171
+ pmullw mm4, mm5 // src[4] * 85
+
+
+ movd DWORD PTR [edi], mm0 // des[0] is src[0] unchanged
+ paddw mm1, mm2 // src[1] * 85 + src[2] * 171
+
+ paddw mm1, round_values
+ psrlw mm1, 8
+
+ packuswb mm1, mm7
+ paddw mm3, mm4 // src[3] * 171 + src[4] * 85
+
+ paddw mm3, round_values
+ movd DWORD PTR [edi+edx], mm1 // des[1]
+
+ psrlw mm3, 8
+ packuswb mm3, mm7
+
+ movd DWORD PTR [edi+edx*2], mm3 // des[2]
+
+
+ add edi, 4
+ add esi, 4
+
+ sub ebx, 4
+ jg vs_5_3_loop
+
+ pop ebx
+ }
+}
+
+
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_2_1_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Source line.
+ * unsigned int source_width : Not referenced by the code below.
+ * unsigned char *dest : Destination line.
+ * unsigned int dest_width : Number of destination pixels; the loop
+ *                           produces 4 per pass until its index reaches
+ *                           this value.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels by
+ *            keeping every other source byte: dest[i] = source[2 * i].
+ *
+ * SPECIAL NOTES : Each pass loads eight source bytes and stores four.
+ * The (void) cast on dest_width is redundant; the asm block reads it.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+ mov esi, source
+ mov edi, dest
+
+ pxor mm7, mm7 // zero, packed into the unused upper half of the result
+ mov ecx, dest_width // loop limit
+
+ xor edx, edx // destination index
+ hs_2_1_loop:
+
+ movq mm0, [esi+edx*2] // eight source bytes starting at source[2 * index]
+ psllw mm0, 8 // push the odd bytes out of each word
+
+ psrlw mm0, 8 // words now hold the even bytes only
+ packuswb mm0, mm7 // four kept bytes in the low doubleword
+
+ movd DWORD Ptr [edi+edx], mm0;
+ add edx, 4 // four destination pixels done
+
+ cmp edx, ecx
+ jl hs_2_1_loop
+
+ }
+}
+
+
+
+/*
+ * 2 to 1 vertical down-scaling without interpolation: the destination row
+ * is a byte-for-byte copy of the source row. Only dest_width bytes are
+ * copied; neither pitch is needed.
+ */
+static
+void vertical_band_2_1_scale_mmx
+(
+    unsigned char *source,
+    unsigned int src_pitch,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    (void) src_pitch;
+    (void) dest_pitch;
+
+    vpx_memcpy(dest, source, dest_width);
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_2_1_scale_i_mmx
+ *
+ * FUNCTION : Produces one destination row from three source rows, four
+ *            columns per pass. With p, q, r the rows at
+ *            source - src_pitch, source and source + src_pitch:
+ *                dest = (p * 48 + q * 160 + r * 48 + 128) >> 8
+ *
+ * SPECIAL NOTES : The row above source is read, so source must not be
+ * the first row of its buffer. dest_pitch is not referenced. mm6 is
+ * loaded with the rounding values but the loop adds them from memory.
+ *
+ ****************************************************************************/
+static
+void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+ __declspec(align(16)) const unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
+ __declspec(align(16)) const unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
+ __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+ __asm
+ {
+ mov esi, source
+ mov edi, dest
+
+ mov eax, src_pitch
+ mov edx, dest_width
+
+ pxor mm7, mm7
+ sub esi, eax //back one line
+
+
+ lea ecx, [esi+edx]; // end of the row above; the loop stops when esi reaches it
+ movq mm6, round_values;
+
+ movq mm5, three_sixteenths;
+ movq mm4, ten_sixteenths;
+
+ vs_2_1_i_loop:
+ movd mm0, [esi] // row above
+ movd mm1, [esi+eax] // current row
+
+ movd mm2, [esi+eax*2] // row below
+ punpcklbw mm0, mm7
+
+ pmullw mm0, mm5 // above * 48
+ punpcklbw mm1, mm7
+
+ pmullw mm1, mm4 // current * 160
+ punpcklbw mm2, mm7
+
+ pmullw mm2, mm5 // below * 48
+ paddw mm0, round_values // + 128
+
+ paddw mm1, mm2
+ paddw mm0, mm1 // weighted sum of the three rows plus rounding
+
+ psrlw mm0, 8
+ packuswb mm0, mm7
+
+ movd DWORD PTR [edi], mm0 // four output pixels
+ add esi, 4
+
+ add edi, 4;
+ cmp esi, ecx
+ jl vs_2_1_i_loop
+
+ }
+}
+
+/*
+ * Points every vp8_* scaler function pointer at its implementation for
+ * MMX-capable processors. The 3:4 and 2:3 ratios are assigned the C
+ * versions; all other ratios are assigned the MMX routines of this file.
+ */
+void
+register_mmxscalers(void)
+{
+    // Horizontal line scalers, MMX.
+    vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
+    vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
+    vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
+    vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
+    vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
+    vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
+
+    // Vertical band scalers, MMX.
+    vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
+    vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
+    vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
+    vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
+    vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
+    vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
+
+    // Last-band variants of the vertical up-scalers, MMX.
+    vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
+    vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
+    vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
+
+    // 3:4 scaling uses the C versions.
+    vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
+    vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+
+    // 2:3 scaling uses the C versions.
+    vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
+    vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+}
diff --git a/vpx_scale/intel_linux/scalesystemdependant.c b/vpx_scale/intel_linux/scalesystemdependant.c
new file mode 100644
index 000000000..9ed48bfc6
--- /dev/null
+++ b/vpx_scale/intel_linux/scalesystemdependant.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : system_dependant.c
+*
+* Description : Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+#include "cpuidlib.h"
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+extern void register_generic_scalers(void);
+extern void register_mmxscalers(void);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Queries the processor flags and points the vp8_* scaler
+ *                  function pointers either at the MMX scalers (through
+ *                  register_mmxscalers) or at the C versions.
+ *
+ *  SPECIAL NOTES : Any one of the three processor flags being set selects
+ *                  the MMX scalers.
+ *
+ ****************************************************************************/
+void
+vp8_scale_machine_specific_config(void)
+{
+    int mmx_enabled;
+    int xmm_enabled;
+    int wmt_enabled;
+
+    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+    if (!mmx_enabled && !xmm_enabled && !wmt_enabled)
+    {
+        // None of the flags is set: use the original 'C' versions.
+
+        // Horizontal line scalers.
+        vp8_horizontal_line_1_2_scale = vp8cx_horizontal_line_1_2_scale_c;
+        vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
+        vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
+        vp8_horizontal_line_3_5_scale = vp8cx_horizontal_line_3_5_scale_c;
+        vp8_horizontal_line_4_5_scale = vp8cx_horizontal_line_4_5_scale_c;
+        vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c;
+        vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c;
+        vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
+
+        // Vertical band scalers.
+        vp8_vertical_band_1_2_scale = vp8cx_vertical_band_1_2_scale_c;
+        vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
+        vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
+        vp8_vertical_band_3_5_scale = vp8cx_vertical_band_3_5_scale_c;
+        vp8_vertical_band_4_5_scale = vp8cx_vertical_band_4_5_scale_c;
+        vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c;
+        vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c;
+        vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c;
+        vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c;
+
+        // Last-band variants of the vertical up-scalers.
+        vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+        vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+        vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+        vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+        vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+    }
+    else
+    {
+        // At least one flag is set: use the MMX versions.
+        register_mmxscalers();
+    }
+}
diff --git a/vpx_scale/leapster/doptsystemdependant_lf.c b/vpx_scale/leapster/doptsystemdependant_lf.c
new file mode 100644
index 000000000..ca1316730
--- /dev/null
+++ b/vpx_scale/leapster/doptsystemdependant_lf.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : system_dependant.c
+*
+* Description : Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+extern int register_generic_scalers(void);
+extern int de_register_generic_scalers(void);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int : The value returned by register_generic_scalers.
+ *
+ *  FUNCTION      : Sets up the scalers for this platform by calling
+ *                  register_generic_scalers.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+int
+vp8_scale_machine_specific_config()
+{
+    const int status = register_generic_scalers();
+
+    return status;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale_machine_specific_de_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int : The value returned by de_register_generic_scalers.
+ *
+ *  FUNCTION      : Undoes vp8_scale_machine_specific_config by calling
+ *                  de_register_generic_scalers.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+int
+scale_machine_specific_de_config()
+{
+    const int status = de_register_generic_scalers();
+
+    return status;
+}
diff --git a/vpx_scale/leapster/gen_scalers_lf.c b/vpx_scale/leapster/gen_scalers_lf.c
new file mode 100644
index 000000000..1b9c7c745
--- /dev/null
+++ b/vpx_scale/leapster/gen_scalers_lf.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : gen_scalers.c
+ *
+ * Description : Generic image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_4_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels;
+ *                                                consumed in groups of 4.
+ *                  unsigned char *dest         : Pointer to destination data;
+ *                                                receives 5 pixels per group.
+ *                  unsigned int dest_width     : NOT USED.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : Every group but the last interpolates its fifth output
+ *                  with the first pixel of the next group; the last group
+ *                  repeats its own fourth pixel instead. A width of zero
+ *                  writes nothing. Widths of 1 to 3 run the last-group code
+ *                  once, which still reads four source pixels.
+ *
+ ****************************************************************************/
+static
+void vp8cx_horizontal_line_4_5_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // Nothing to scale on an empty line.
+    if (source_width == 0)
+        return;
+
+    // i is the pixel count up to the end of the current group. Comparing it
+    // with source_width directly avoids the unsigned wrap that
+    // "source_width - 4" has for widths below 4.
+    for (i = 4; i < source_width; i += 4)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char) a;
+        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+        c = src[2] * 154;
+        a = src[3];
+        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+        b = src[4];     // first pixel of the next group
+        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);
+
+        src += 4;
+        des += 5;
+    }
+
+    // Last group: no pixel to the right, so the fifth output repeats src[3].
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+    c = src[2] * 154;
+    a = src[3];
+    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+    des [4] = (unsigned char)(a);
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_4_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest     : First row of the band; scaled
+ *                                            in place.
+ *                  unsigned int dest_pitch : Distance in bytes between rows.
+ *                  unsigned int dest_width : Number of columns to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales a 4-row band to 5 rows, one column at a time.
+ *                  Row 0 is left as it is; rows 1 to 4 are rewritten.
+ *
+ *  SPECIAL NOTES : Row 4 is interpolated with row 5, the first line of the
+ *                  band below the current band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *line = dest + col;
+        unsigned int top, second, third_x154, fourth, below;
+
+        top    = line[0];
+        second = line[dest_pitch];
+        line[dest_pitch] = (unsigned char)((51 * top + 205 * second + 128) >> 8);
+
+        third_x154 = 154 * line[2 * dest_pitch];
+        fourth     = line[3 * dest_pitch];
+        line[2 * dest_pitch] = (unsigned char)((102 * second + third_x154 + 128) >> 8);
+        line[3 * dest_pitch] = (unsigned char)((third_x154 + 102 * fourth + 128) >> 8);
+
+        // Row 5 is the first line of the next band.
+        below = line[5 * dest_pitch];
+        line[4 * dest_pitch] = (unsigned char)((205 * fourth + 51 * below + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_4_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest     : First row of the band; scaled
+ *                                            in place.
+ *                  unsigned int dest_pitch : Distance in bytes between rows.
+ *                  unsigned int dest_width : Number of columns to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales the last 4-row band to 5 rows, one column at a
+ *                  time. Row 0 is left as it is; rows 1 to 4 are rewritten.
+ *
+ *  SPECIAL NOTES : There is no band below the last one, so row 4 is a copy
+ *                  of the original row 3 instead of an interpolation.
+ *
+ ****************************************************************************/
+static
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *line = dest + col;
+        unsigned int top, second, third_x154, fourth;
+
+        top    = line[0];
+        second = line[dest_pitch];
+        line[dest_pitch] = (unsigned char)((51 * top + 205 * second + 128) >> 8);
+
+        third_x154 = 154 * line[2 * dest_pitch];
+        fourth     = line[3 * dest_pitch];
+        line[2 * dest_pitch] = (unsigned char)((102 * second + third_x154 + 128) >> 8);
+        line[3 * dest_pitch] = (unsigned char)((third_x154 + 102 * fourth + 128) >> 8);
+
+        // Nothing below to interpolate with: repeat the original row 3.
+        line[4 * dest_pitch] = (unsigned char) fourth;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels;
+ *                                                consumed in groups of 3.
+ *                  unsigned char *dest         : Pointer to destination data;
+ *                                                receives 5 pixels per group.
+ *                  unsigned int dest_width     : NOT USED.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 5.
+ *
+ *  SPECIAL NOTES : Every group but the last interpolates its fifth output
+ *                  with the first pixel of the next group; the last group
+ *                  repeats its own third pixel instead. A width of zero
+ *                  writes nothing. Widths of 1 or 2 run the last-group code
+ *                  once, which still reads three source pixels.
+ *
+ ****************************************************************************/
+static
+void vp8cx_horizontal_line_3_5_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // Nothing to scale on an empty line.
+    if (source_width == 0)
+        return;
+
+    // i is the pixel count up to the end of the current group. Comparing it
+    // with source_width directly avoids the unsigned wrap that
+    // "source_width - 3" has for widths below 3.
+    for (i = 3; i < source_width; i += 3)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+
+        c = src[2] ;
+        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+        a = src[3];     // first pixel of the next group
+        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
+
+        src += 3;
+        des += 5;
+    }
+
+    // Last group: no pixel to the right, so the fifth output repeats src[2].
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+
+    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+    c = src[2] ;
+    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+    des [4] = (unsigned char)(c);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest     : First row of the band; scaled
+ *                                            in place.
+ *                  unsigned int dest_pitch : Distance in bytes between rows.
+ *                  unsigned int dest_width : Number of columns to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales a 3-row band to 5 rows, one column at a time.
+ *                  Row 0 is left as it is; rows 1 to 4 are rewritten.
+ *
+ *  SPECIAL NOTES : Row 4 is interpolated with row 5, the first line of the
+ *                  band below the current band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *line = dest + col;
+        unsigned int top, middle, bottom, below;
+
+        top    = line[0];
+        middle = line[dest_pitch];
+        line[dest_pitch] = (unsigned char)((102 * top + 154 * middle + 128) >> 8);
+
+        bottom = line[2 * dest_pitch];
+        line[2 * dest_pitch] = (unsigned char)((205 * middle + 51 * bottom + 128) >> 8);
+        line[3 * dest_pitch] = (unsigned char)((51 * middle + 205 * bottom + 128) >> 8);
+
+        // Row 5 is the first line of the next band.
+        below = line[5 * dest_pitch];
+        line[4 * dest_pitch] = (unsigned char)((154 * bottom + 102 * below + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_3_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest     : First row of the band; scaled
+ *                                            in place.
+ *                  unsigned int dest_pitch : Distance in bytes between rows.
+ *                  unsigned int dest_width : Number of columns to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales the last 3-row band to 5 rows, one column at a
+ *                  time. Row 0 is left as it is; rows 1 to 4 are rewritten.
+ *
+ *  SPECIAL NOTES : There is no band below the last one, so row 4 is a copy
+ *                  of the original row 2 instead of an interpolation.
+ *
+ ****************************************************************************/
+static
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *line = dest + col;
+        unsigned int top, middle, bottom;
+
+        top    = line[0];
+        middle = line[dest_pitch];
+        line[dest_pitch] = (unsigned char)((102 * top + 154 * middle + 128) >> 8);
+
+        bottom = line[2 * dest_pitch];
+        line[2 * dest_pitch] = (unsigned char)((205 * middle + 51 * bottom + 128) >> 8);
+        line[3 * dest_pitch] = (unsigned char)((51 * middle + 205 * bottom + 128) >> 8);
+
+        // Nothing below to interpolate with: repeat the original row 2.
+        line[4 * dest_pitch] = (unsigned char)(bottom);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data;
+ *                                                2 * source_width bytes are written.
+ *                  unsigned int dest_width     : Width of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 1 to 2. Every source pixel is
+ *                  followed by the rounded average of itself and its right
+ *                  neighbour; the last pixel is simply doubled.
+ *
+ *  SPECIAL NOTES : A zero source_width writes nothing.
+ *
+ ****************************************************************************/
+static
+void vp8cx_horizontal_line_1_2_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // source_width is unsigned: "source_width - 1" would wrap around for an
+    // empty line and run far past both buffers, so bail out first.
+    if (source_width == 0)
+        return;
+
+    for (i = 0; i + 1 < source_width; i++)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a + b + 1) >> 1);
+        src += 1;
+        des += 2;
+    }
+
+    // last pixel has no right neighbour: duplicate it
+    a = src[0];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)(a);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
+ *
+ *  INPUTS        : unsigned char *dest     : First row of the band; scaled in place.
+ *                  unsigned int dest_pitch : Offset between consecutive rows.
+ *                  unsigned int dest_width : Number of columns to process.
+ *
+ *  OUTPUTS       : Row 1 of the band is rewritten.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Expands a one-line band vertically by 1 to 2. Row 1
+ *                  becomes the rounded average of row 0 and row 2.
+ *
+ *  SPECIAL NOTES : Row 2, the first line of the band below, is read, so
+ *                  it must already hold valid data.
+ *
+ ****************************************************************************/
+static
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *above = dest;
+    unsigned int remaining;
+
+    for (remaining = dest_width; remaining != 0; --remaining, ++above)
+    {
+        const unsigned int top = above[0];
+        const unsigned int bottom = above[dest_pitch * 2];
+
+        above[dest_pitch] = (unsigned char)((top + bottom + 1) >> 1);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_1_2_scale_c
+ *
+ *  INPUTS        : unsigned char *dest     : First row of the band; scaled in place.
+ *                  unsigned int dest_pitch : Offset between consecutive rows.
+ *                  unsigned int dest_width : Number of columns to process.
+ *
+ *  OUTPUTS       : Row 1 of the band is rewritten.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Expands the final one-line band vertically by 1 to 2.
+ *
+ *  SPECIAL NOTES : There is no band below the last one, so row 1 is a
+ *                  byte-by-byte copy of row 0 instead of an average.
+ *
+ ****************************************************************************/
+static
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *above = dest;
+    unsigned int remaining = dest_width;
+
+    while (remaining != 0)
+    {
+        above[dest_pitch] = above[0];
+        ++above;
+        --remaining;
+    }
+}
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs = 0; /* scaler function table: allocated by register_generic_scalers(), freed and reset to 0 by de_register_generic_scalers() */
+
+/*
+ * Allocates the global scaler table (if it does not exist yet) and points
+ * every entry at the portable C implementation in this file.
+ *
+ * Returns 0 on success, -1 if the table could not be allocated.
+ */
+int
+register_generic_scalers(void)
+{
+    // Reuse the table from an earlier registration; allocating a new one
+    // unconditionally would overwrite the pointer and leak the old table.
+    if (!g_scaling_ptrs)
+    {
+        g_scaling_ptrs = (struct vpxglobal_scalling_ptrs_t *)vpx_malloc(sizeof(struct vpxglobal_scalling_ptrs_t));
+
+        if (!g_scaling_ptrs)
+            return -1;
+    }
+
+    g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t = vp8cx_horizontal_line_1_2_scale_c;
+    g_scaling_ptrs->vpxvertical_band_1_2_scale_t = vp8cx_vertical_band_1_2_scale_c;
+    g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t = vp8cx_last_vertical_band_1_2_scale_c;
+    g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t = vp8cx_horizontal_line_3_5_scale_c;
+    g_scaling_ptrs->vpxvertical_band_3_5_scale_t = vp8cx_vertical_band_3_5_scale_c;
+    g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t = vp8cx_last_vertical_band_3_5_scale_c;
+    g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t = vp8cx_horizontal_line_4_5_scale_c;
+    g_scaling_ptrs->vpxvertical_band_4_5_scale_t = vp8cx_vertical_band_4_5_scale_c;
+    g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t = vp8cx_last_vertical_band_4_5_scale_c;
+
+    return 0;
+}
+
+/*
+ * Releases the global scaler table and clears the pointer to it.
+ *
+ * Returns 0 if a table was released, -1 if none was registered.
+ */
+int
+de_register_generic_scalers(void)
+{
+    if (!g_scaling_ptrs)
+        return -1;
+
+    vpx_free(g_scaling_ptrs);
+    g_scaling_ptrs = 0;
+
+    return 0;
+}
diff --git a/vpx_scale/leapster/vpxscale_lf.c b/vpx_scale/leapster/vpxscale_lf.c
new file mode 100644
index 000000000..5f05e5de0
--- /dev/null
+++ b/vpx_scale/leapster/vpxscale_lf.c
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : scale.c
+ *
+ * Description : Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "stdlib.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+#include "codec_common_interface.h"
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+/*
+void (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+void (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+void (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+void (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+*/
+
+
+typedef struct /* parameter bundle handed to any_ratio_frame_scale() / any_ratio_2d_scale() */
+{
+ int expanded_frame_width; /* width of the scaled luma image; passed on as the destination width */
+ int expanded_frame_height; /* height of the scaled luma image; passed on as the destination height */
+
+ int HScale; /* horizontal ratio pair: source width = ceil(expanded_frame_width * HRatio / HScale) */
+ int HRatio;
+ int VScale; /* vertical ratio pair: source height = ceil(expanded_frame_height * VRatio / VScale) */
+ int VRatio;
+
+ YV12_BUFFER_CONFIG *src_yuv_config; /* frame read from */
+ YV12_BUFFER_CONFIG *dst_yuv_config; /* frame written to */
+
+} SCALE_VARS;
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_copy
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of bytes to copy.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Width of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 1 "scaling" of a horizontal line of pixels, i.e.
+ *                  a plain copy of source_width bytes.
+ *
+ *  SPECIAL NOTES : Has the same signature as the real line scalers so it
+ *                  can be used through the same function pointer.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_copy(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    duck_memcpy(dest, source, source_width);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : null_scale
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to destination data (NOT USED).
+ *                  unsigned int dest_pitch : Stride of destination data (NOT USED).
+ *                  unsigned int dest_width : Width of destination data (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 1 "scaling" of a vertical band: does nothing.
+ *
+ *  SPECIAL NOTES : Has the same signature as the real band scalers so it
+ *                  can be used through the same function pointer.
+ *
+ ****************************************************************************/
+static
+void null_scale(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    (void) dest;
+    (void) dest_pitch;
+    (void) dest_width;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_i
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step             : Number of pixels to step on in source.
+ *                  unsigned int source_scale   : Scale for source (UNUSED).
+ *                  unsigned int source_length  : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step               : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale     : Scale for destination (UNUSED).
+ *                  unsigned int dest_length    : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 interpolated scaling. The first output
+ *                  is a copy of the first input; every later output is a
+ *                  (3, 10, 3) / 16 weighted sum of an even-indexed input
+ *                  sample and its two immediate neighbours.
+ *
+ *  SPECIAL NOTES : The highest sample read is the one right after the
+ *                  last even-indexed input, i.e. 2 * dest_length samples
+ *                  must be readable when dest_length > 1.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_i
+(
+    const unsigned char *source,
+    int source_step,
+    unsigned int source_scale,
+    unsigned int source_length,
+    unsigned char *dest,
+    int dest_step,
+    unsigned int dest_scale,
+    unsigned int dest_length
+)
+{
+    unsigned int i, j;
+    unsigned int temp;
+    // Distance to the adjacent samples. Must be taken before source_step is
+    // doubled: using the doubled step for the taps would skip the real
+    // neighbours and read one sample past the end of the line.
+    const int neighbour_step = source_step;
+
+    (void) source_length;
+    (void) source_scale;
+    (void) dest_scale;
+
+    source_step *= 2;
+    dest[0] = source[0];
+
+    for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step)
+    {
+        temp = 8;
+        temp += 3 * source[j-neighbour_step];
+        temp += 10 * source[j];
+        temp += 3 * source[j+neighbour_step];
+        temp >>= 4;
+        dest[i] = (unsigned char)(temp);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_ps
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step             : Number of pixels to step on in source.
+ *                  unsigned int source_scale   : Scale for source (UNUSED).
+ *                  unsigned int source_length  : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step               : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale     : Scale for destination (UNUSED).
+ *                  unsigned int dest_length    : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 point subsampled scaling: every second
+ *                  input sample is copied to the output, no filtering.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_ps
+(
+    const unsigned char *source,
+    int source_step,
+    unsigned int source_scale,
+    unsigned int source_length,
+    unsigned char *dest,
+    int dest_step,
+    unsigned int dest_scale,
+    unsigned int dest_length
+)
+{
+    const unsigned int out_end = dest_length * dest_step;
+    const int in_stride = source_step * 2;
+    unsigned int out = 0;
+    unsigned int in = 0;
+
+    (void) source_length;
+    (void) source_scale;
+    (void) dest_scale;
+
+    while (out < out_end)
+    {
+        dest[out] = source[in];
+        out += dest_step;
+        in += in_stride;
+    }
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step             : Number of pixels to step on in source.
+ *                  unsigned int source_scale   : Scale for source.
+ *                  unsigned int source_length  : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step               : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale     : Scale for destination.
+ *                  unsigned int dest_length    : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs linear interpolation in one dimension. Each
+ *                  output is a weighted mix of the current pair of
+ *                  adjacent input samples; the weight of the right-hand
+ *                  sample grows by source_scale per output and the pair
+ *                  moves on whenever that weight exceeds dest_scale.
+ *
+ *  SPECIAL NOTES : The sample to the right of the current one is always
+ *                  fetched, even when its weight is zero.
+ *
+ ****************************************************************************/
+static
+void scale1d_c
+(
+    const unsigned char *source,
+    int source_step,
+    unsigned int source_scale,
+    unsigned int source_length,
+    unsigned char *dest,
+    int dest_step,
+    unsigned int dest_scale,
+    unsigned int dest_length
+)
+{
+    const unsigned int rounding = dest_scale / 2;
+    const unsigned int out_end = dest_length * dest_step;
+    unsigned int weight_right = 0;
+    unsigned int out;
+    unsigned char left = source[0];
+    unsigned char right = source[source_step];
+
+    (void) source_length;
+
+    // These asserts are needed if there are boundary issues...
+    //assert ( dest_scale > source_scale );
+    //assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );
+
+    for (out = 0; out < out_end; out += dest_step)
+    {
+        const unsigned int weight_left = dest_scale - weight_right;
+
+        dest[out] = (char)((weight_left * left + weight_right * right + rounding) / dest_scale);
+
+        // advance the interpolation position, sliding the sample pair along
+        // the source each time the position crosses a source sample
+        for (weight_right += source_scale; weight_right > dest_scale; weight_right -= dest_scale)
+        {
+            source += source_step;
+            left = source[0];
+            right = source[source_step];
+        }
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : Scale2D
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_pitch : Stride of source image.
+ * unsigned int source_width : Width of input image.
+ * unsigned int source_height : Height of input image.
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_pitch : Stride of destination image.
+ * unsigned int dest_width : Width of destination image.
+ * unsigned int dest_height : Height of destination image.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_area_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale factor numerator.
+ * unsigned int hratio : Horizontal scale factor denominator.
+ * unsigned int vscale : Vertical scale factor numerator.
+ * unsigned int vratio : Vertical scale factor denominator.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales an image in two passes per band: source rows are
+ * scaled horizontally into temp_area, then each column of
+ * temp_area is scaled vertically into dest. When the source
+ * and destination heights are equal only the horizontal pass
+ * runs, straight into dest. The 1-D routines used are
+ * scale1d_c (linear interpolation) by default, and the
+ * scale1d_2t1_* variants for an exact 2/1 factor.
+ *
+ * SPECIAL NOTES : Expansion is performed one band at a time to help with
+ * caching. temp_area is addressed with dest_pitch as its row
+ * stride and rows 0 to source_band_height are used; the last
+ * row of a band is carried over as row 0 of the next band.
+ * Presumably the caller sizes temp_area as temp_area_height
+ * rows of dest_pitch bytes - TODO confirm against callers.
+ *
+ ****************************************************************************/
+static
+void Scale2D
+(
+ const unsigned char *source,
+ int source_pitch,
+ unsigned int source_width,
+ unsigned int source_height,
+ unsigned char *dest,
+ int dest_pitch,
+ unsigned int dest_width,
+ unsigned int dest_height,
+ unsigned char *temp_area,
+ unsigned char temp_area_height,
+ unsigned int hscale,
+ unsigned int hratio,
+ unsigned int vscale,
+ unsigned int vratio,
+ unsigned int interlaced
+)
+{
+ unsigned int i, j, k;
+ unsigned int bands;
+ unsigned int dest_band_height;
+ unsigned int source_band_height;
+
+ typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
+ unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
+
+ Scale1D Scale1Dv = scale1d_c; // vertical pass, linear interpolation unless overridden below
+ Scale1D Scale1Dh = scale1d_c; // horizontal pass, linear interpolation unless overridden below
+
+ if (hscale == 2 && hratio == 1)
+ Scale1Dh = scale1d_2t1_ps; // 2-to-1 horizontally: point subsampling
+
+ if (vscale == 2 && vratio == 1)
+ {
+ if (interlaced)
+ Scale1Dv = scale1d_2t1_ps; // interlaced 2-to-1: point subsampling, no filtering across rows
+ else
+ Scale1Dv = scale1d_2t1_i; // 2-to-1 with the 3-tap filter
+ }
+
+ if (source_height == dest_height)
+ {
+ // for each band of the image
+ for (k = 0; k < dest_height; k++)
+ {
+ Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ if (dest_height > source_height)
+ {
+ dest_band_height = temp_area_height - 1; // one temp_area row is reserved for the carried-over row
+ source_band_height = dest_band_height * source_height / dest_height;
+ }
+ else
+ {
+ source_band_height = temp_area_height - 1; // one temp_area row is reserved for the carried-over row
+ dest_band_height = source_band_height * vratio / vscale;
+ }
+
+ // first row needs to be done so that we can stay one row ahead for vertical zoom
+ Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
+
+ // for each band of the image
+ bands = (dest_height + dest_band_height - 1) / dest_band_height; // rounds up; NOTE(review): divides by zero if dest_band_height is 0 - confirm callers rule that out
+
+ for (k = 0; k < bands; k++)
+ {
+ // scale one band horizontally
+ for (i = 1; i < source_band_height + 1; i++)
+ {
+ if (k * source_band_height + i < source_height)
+ {
+ Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+ temp_area + i * dest_pitch, 1, hratio, dest_width);
+ }
+ else // Duplicate the last row
+ {
+ // past the bottom of the source: repeat the previous temp_area row
+ duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+ }
+ }
+
+ // scale one band vertically
+ for (j = 0; j < dest_width; j++)
+ {
+ Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+ &dest[j], dest_pitch, vratio, dest_band_height);
+ }
+
+ // copy temp_area row 0 over from last row in the past
+ duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+ // move to the next band
+ source += source_band_height * source_pitch;
+ dest += dest_band_height * dest_pitch;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_frame
+ *
+ * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be scaled.
+ * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold scaled frame.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale factor numerator.
+ * unsigned int hratio : Horizontal scale factor denominator.
+ * unsigned int vscale : Vertical scale factor numerator.
+ * unsigned int vratio : Vertical scale factor denominator.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales the Y, U and V planes of src into dst with Scale2D.
+ * The scaled luma size is dw x dh, each rounded up from
+ * src size * ratio / scale; the chroma planes use dw / 2 by
+ * dh / 2. Where the scaled size is smaller than the
+ * destination plane, the remainder is filled by repeating
+ * the edge of the scaled image.
+ *
+ * SPECIAL NOTES : Expansion is performed one band at a time to help with
+ * caching. The same temp_area is reused for all three planes.
+ *
+ ****************************************************************************/
+void vp8_scale_frame
+(
+ YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ unsigned char *temp_area,
+ unsigned char temp_height,
+ unsigned int hscale,
+ unsigned int hratio,
+ unsigned int vscale,
+ unsigned int vratio,
+ unsigned int interlaced
+)
+{
+ int i;
+ int dw = (hscale - 1 + src->y_width * hratio) / hscale; // scaled luma width, rounded up
+ int dh = (vscale - 1 + src->y_height * vratio) / vscale; // scaled luma height, rounded up
+
+ // call our internal scaling routines!!
+ Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
+ (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
+ temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+ if (dw < (int)dst->y_width)
+ for (i = 0; i < dh; i++)
+ duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1); // from column dw-1 rightwards, repeat the pixel at column dw-2
+
+ if (dh < (int)dst->y_height)
+ for (i = dh - 1; i < (int)dst->y_height; i++)
+ duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1); // rows dh-1 and below become copies of row dh-2; NOTE(review): copies y_width + 1 bytes, the chroma passes copy uv_width - confirm
+
+ Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
+ (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
+ temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+ if (dw / 2 < (int)dst->uv_width)
+ for (i = 0; i < dst->uv_height; i++) // NOTE(review): walks all uv_height rows, whereas the luma pass walks only the dh scaled rows - confirm
+ duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+
+ if (dh / 2 < (int)dst->uv_height)
+ for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++) // NOTE(review): bound is y_height / 2 rather than uv_height - confirm they agree for odd heights
+ duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+
+ Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
+ (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
+ temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+ if (dw / 2 < (int)dst->uv_width)
+ for (i = 0; i < dst->uv_height; i++) // same fill as for the U plane
+ duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+
+ if (dh / 2 < (int) dst->uv_height)
+ for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++) // same fill as for the U plane
+ duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : any_ratio_2d_scale
+ *
+ *  INPUTS        : SCALE_VARS *si              : Supplies the HScale/HRatio and
+ *                                                VScale/VRatio factors.
+ *                  const unsigned char *source : Pointer to source image.
+ *                  unsigned int source_pitch   : Stride of source image.
+ *                  unsigned int source_width   : Width of source image.
+ *                  unsigned int source_height  : Height of source image (NOT USED).
+ *                  unsigned char *dest         : Pointer to destination image.
+ *                  unsigned int dest_pitch     : Stride of destination image.
+ *                  unsigned int dest_width     : Width of destination image.
+ *                  unsigned int dest_height    : Height of destination image.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ *  FUNCTION      : Scales one plane using the fixed-ratio scalers from
+ *                  g_scaling_ptrs. Supported ratios in each direction are
+ *                  4-5, 3-5, 1-2 and 1-1, picked from (ratio * 10 / scale).
+ *                  Source lines are scaled horizontally straight into the
+ *                  destination and each band is then scaled vertically in
+ *                  place.
+ *
+ *  SPECIAL NOTES : Nothing is written when either ratio is unsupported.
+ *
+ ****************************************************************************/
+static
+int any_ratio_2d_scale
+(
+    SCALE_VARS *si,
+    const unsigned char *source,
+    unsigned int source_pitch,
+    unsigned int source_width,
+    unsigned int source_height,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width,
+    unsigned int dest_height
+)
+{
+    typedef void (*line_scaler)(const unsigned char *, unsigned int, unsigned char *, unsigned int);
+    typedef void (*band_scaler)(unsigned char *, unsigned int, unsigned int);
+
+    line_scaler scale_line = NULL;
+    band_scaler scale_band = NULL;
+    band_scaler scale_last_band = NULL;
+
+    unsigned int rows_in = 0;   // source lines per band
+    unsigned int rows_out = 0;  // destination lines per band
+    unsigned int band, line, full_bands;
+
+    // suggested scale factors
+    const int h_scale = si->HScale;
+    const int h_ratio = si->HRatio;
+    const int v_scale = si->VScale;
+    const int v_ratio = si->VRatio;
+
+    // cleared as soon as one direction asks for an unsupported ratio
+    int supported = 1;
+
+    (void) source_height;
+
+    // horizontal direction
+    switch (h_ratio * 10 / h_scale)
+    {
+    case 10:    // 1-1, plain copy
+        scale_line = horizontal_line_copy;
+        break;
+    case 8:     // 4-5
+        scale_line = g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t;
+        break;
+    case 6:     // 3-5
+        scale_line = g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t;
+        break;
+    case 5:     // 1-2
+        scale_line = g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t;
+        break;
+    default:    // ratio not handled
+        supported = 0;
+        break;
+    }
+
+    // vertical direction
+    switch (v_ratio * 10 / v_scale)
+    {
+    case 10:    // 1-1, nothing to do per band
+        scale_band = null_scale;
+        scale_last_band = null_scale;
+        rows_in = 4;
+        rows_out = 4;
+        break;
+    case 8:     // 4-5
+        scale_band = g_scaling_ptrs->vpxvertical_band_4_5_scale_t;
+        scale_last_band = g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t;
+        rows_in = 4;
+        rows_out = 5;
+        break;
+    case 6:     // 3-5
+        scale_band = g_scaling_ptrs->vpxvertical_band_3_5_scale_t;
+        scale_last_band = g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t;
+        rows_in = 3;
+        rows_out = 5;
+        break;
+    case 5:     // 1-2
+        scale_band = g_scaling_ptrs->vpxvertical_band_1_2_scale_t;
+        scale_last_band = g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t;
+        rows_in = 1;
+        rows_out = 2;
+        break;
+    default:    // ratio not handled
+        supported = 0;
+        break;
+    }
+
+    if (!supported)
+        return supported;
+
+    // the very first line; later bands get theirs from the band before
+    scale_line(source, source_width, dest, dest_width);
+
+    // every band but the last one
+    full_bands = (dest_height + rows_out - 1) / rows_out - 1;
+
+    for (band = 0; band < full_bands; band++)
+    {
+        // remaining lines of this band
+        for (line = 1; line < rows_in; line++)
+            scale_line(source + line * source_pitch, source_width,
+                       dest + line * dest_pitch, dest_width);
+
+        // first line of the next band, needed by the vertical scaler
+        scale_line(source + rows_in * source_pitch, source_width,
+                   dest + rows_out * dest_pitch, dest_width);
+
+        // vertical scaling happens in place
+        scale_band(dest, dest_pitch, dest_width);
+
+        source += rows_in * source_pitch;
+        dest += rows_out * dest_pitch;
+    }
+
+    // last band: no following line is available
+    for (line = 1; line < rows_in; line++)
+        scale_line(source + line * source_pitch, source_width,
+                   dest + line * dest_pitch, dest_width);
+
+    scale_last_band(dest, dest_pitch, dest_width);
+
+    return supported;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : any_ratio_frame_scale
+ *
+ *  INPUTS        : SCALE_VARS *scale_vars : Scale factors, output size and the
+ *                                           source / destination frames.
+ *                  int YOffset            : Offset into the destination Y plane
+ *                                           at which the scaled image starts.
+ *                  int UVOffset           : Same, for the destination U and V planes.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ *  FUNCTION      : Scales the Y plane with any_ratio_2d_scale, zeroes the
+ *                  area between the scaled image and its rounded-up
+ *                  extent, then scales the half-size U and V planes.
+ *
+ *  SPECIAL NOTES : The Y padding is zeroed even when the ratio turns out
+ *                  to be unsupported; the chroma planes are skipped then.
+ *
+ ****************************************************************************/
+static
+int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset)
+{
+    YV12_BUFFER_CONFIG *const src = scale_vars->src_yuv_config;
+    YV12_BUFFER_CONFIG *const dst = scale_vars->dst_yuv_config;
+
+    // suggested scale factors
+    const int h_scale = scale_vars->HScale;
+    const int h_ratio = scale_vars->HRatio;
+    const int v_scale = scale_vars->VScale;
+    const int v_ratio = scale_vars->VRatio;
+
+    // destination size, and the source size it is derived from (rounded up)
+    int dw = scale_vars->expanded_frame_width;
+    int dh = scale_vars->expanded_frame_height;
+    int sw = (dw * h_ratio + h_scale - 1) / h_scale;
+    int sh = (dh * v_ratio + v_scale - 1) / v_scale;
+
+    // destination extent once the source size is rounded up to a multiple
+    // of 3 (for a ratio of 3) or of 8 (anything else)
+    const int ew = ((h_ratio == 3) ? (sw + 2) / 3 * 3 : (sw + 7) / 8 * 8) * h_scale / h_ratio;
+    const int eh = ((v_ratio == 3) ? (sh + 2) / 3 * 3 : (sh + 7) / 8 * 8) * v_scale / v_ratio;
+
+    int line;
+    int scaled;
+
+    scaled = any_ratio_2d_scale(scale_vars,
+                                (const unsigned char *)src->y_buffer,
+                                src->y_stride, sw, sh,
+                                (unsigned char *) dst->y_buffer + YOffset,
+                                dst->y_stride, dw, dh);
+
+    // clear the columns to the right of the scaled image
+    for (line = 0; line < eh; line++)
+        duck_memset(dst->y_buffer + YOffset + line * dst->y_stride + dw, 0, ew - dw);
+
+    // clear the rows underneath the scaled image
+    for (line = dh; line < eh; line++)
+        duck_memset(dst->y_buffer + YOffset + line * dst->y_stride, 0, ew);
+
+    if (scaled == 0)
+        return scaled;
+
+    // chroma planes are half size in both directions, rounded up
+    sw = (sw + 1) >> 1;
+    sh = (sh + 1) >> 1;
+    dw = (dw + 1) >> 1;
+    dh = (dh + 1) >> 1;
+
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src->u_buffer,
+                       src->y_stride / 2, sw, sh,
+                       (unsigned char *)dst->u_buffer + UVOffset,
+                       dst->uv_stride, dw, dh);
+
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src->v_buffer,
+                       src->y_stride / 2, sw, sh,
+                       (unsigned char *)dst->v_buffer + UVOffset,
+                       dst->uv_stride, dw, dh);
+
+    return scaled;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : center_image
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_yuv_config : Frame to copy from.
+ *                  YV12_BUFFER_CONFIG *dst_yuv_config : Frame to copy into.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Centers the image without scaling in the output buffer:
+ *                  each plane is copied line by line to an offset of half
+ *                  the size difference between destination and source.
+ *
+ *  SPECIAL NOTES : The chroma offsets are the luma offsets divided by two.
+ *
+ ****************************************************************************/
+static void
+center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config)
+{
+    int line;
+    char *from;
+    char *to;
+
+    // top-left corner of the centered image inside the destination
+    int down = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2;
+    int across = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2;
+
+    // Y plane
+    from = src_yuv_config->y_buffer;
+    to = (char *)dst_yuv_config->y_buffer + (down * dst_yuv_config->y_stride) + across;
+
+    for (line = 0; line < src_yuv_config->y_height; line++)
+    {
+        duck_memcpy(to, from, src_yuv_config->y_width);
+        from += src_yuv_config->y_stride;
+        to += dst_yuv_config->y_stride;
+    }
+
+    // chroma planes sit at half the luma offset
+    down /= 2;
+    across /= 2;
+
+    // U plane
+    from = src_yuv_config->u_buffer;
+    to = (char *)dst_yuv_config->u_buffer + (down * dst_yuv_config->uv_stride) + across;
+
+    for (line = 0; line < src_yuv_config->uv_height; line++)
+    {
+        duck_memcpy(to, from, src_yuv_config->uv_width);
+        from += src_yuv_config->uv_stride;
+        to += dst_yuv_config->uv_stride;
+    }
+
+    // V plane
+    from = src_yuv_config->v_buffer;
+    to = (char *)dst_yuv_config->v_buffer + (down * dst_yuv_config->uv_stride) + across;
+
+    for (line = 0; line < src_yuv_config->uv_height; line++)
+    {
+        duck_memcpy(to, from, src_yuv_config->uv_width);
+        from += src_yuv_config->uv_stride;
+        to += dst_yuv_config->uv_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_scale_or_center
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_yuv_config : Frame to read from.
+ *                  YV12_BUFFER_CONFIG *dst_yuv_config : Frame to write into.
+ *                  int expanded_frame_width           : Width of the scaled image.
+ *                  int expanded_frame_height          : Height of the scaled image.
+ *                  int scaling_mode                   : SCALE_TO_FIT, MAINTAIN_ASPECT_RATIO
+ *                                                       or CENTER.
+ *                  int HScale, int HRatio             : Horizontal scale factors.
+ *                  int VScale, int VRatio             : Vertical scale factors.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Decides to scale or center image in scale buffer for blit.
+ *                  CENTER copies the source unscaled into the middle of the
+ *                  destination; the two scaling modes scale it and place the
+ *                  result in the middle of the destination.
+ *
+ *  SPECIAL NOTES : Any other scaling_mode does nothing. The result of the
+ *                  scaling call is not reported to the caller.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_scale_or_center
+(
+    YV12_BUFFER_CONFIG *src_yuv_config,
+    YV12_BUFFER_CONFIG *dst_yuv_config,
+    int expanded_frame_width,
+    int expanded_frame_height,
+    int scaling_mode,
+    int HScale,
+    int HRatio,
+    int VScale,
+    int VRatio
+)
+{
+    SCALE_VARS scale_vars;
+    int row, col;
+    int YOffset, UVOffset;
+
+    if (scaling_mode == CENTER)
+    {
+        center_image(src_yuv_config, dst_yuv_config);
+        return;
+    }
+
+    if (scaling_mode != SCALE_TO_FIT && scaling_mode != MAINTAIN_ASPECT_RATIO)
+        return;
+
+    // where the scaled image starts so that it ends up centered
+    row = (dst_yuv_config->y_height - expanded_frame_height) / 2;
+    col = (dst_yuv_config->y_width - expanded_frame_width) / 2;
+    YOffset = row * dst_yuv_config->y_stride + col;
+    UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1);
+
+    scale_vars.expanded_frame_width = expanded_frame_width;
+    scale_vars.expanded_frame_height = expanded_frame_height;
+    scale_vars.HScale = HScale;
+    scale_vars.HRatio = HRatio;
+    scale_vars.VScale = VScale;
+    scale_vars.VRatio = VRatio;
+    scale_vars.src_yuv_config = src_yuv_config;
+    scale_vars.dst_yuv_config = dst_yuv_config;
+
+    // perform center and scale
+    any_ratio_frame_scale(&scale_vars, YOffset, UVOffset);
+}
diff --git a/vpx_scale/leapster/yv12extend.c b/vpx_scale/leapster/yv12extend.c
new file mode 100644
index 000000000..480d971b4
--- /dev/null
+++ b/vpx_scale/leapster/yv12extend.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ * Module Title : yv12extend.c
+ *
+ * Description :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+//#include <stdlib.h>
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : extend_plane_borders
+ *
+ *  INPUTS        : char *plane         : First visible pixel of the plane.
+ *                  int stride          : Bytes between successive rows.
+ *                  int height          : Visible rows in the plane.
+ *                  int width           : Visible pixels per row.
+ *                  unsigned int border : Border thickness in pixels.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Replicates the edge pixels of one plane into the border
+ *                  that surrounds it.
+ *
+ *  SPECIAL NOTES : The top/bottom pass copies `stride` bytes starting
+ *                  `border` bytes left of the visible row, so it carries the
+ *                  left/right border written by the first pass with it.
+ *
+ ****************************************************************************/
+static void
+extend_plane_borders(char *plane, int stride, int height, int width, unsigned int border)
+{
+    int n;
+    char *first_px = plane;
+    char *last_px = plane + width - 1;
+    char *top_row;
+    char *bottom_row;
+    char *above;
+    char *below;
+
+    // Pass 1: smear the first and last pixel of every row sideways.
+    for (n = 0; n < height; n++)
+    {
+        memset(first_px - border, first_px[0], border);
+        memset(last_px + 1, last_px[0], border);
+        first_px += stride;
+        last_px += stride;
+    }
+
+    // Pass 2: duplicate the (now widened) first and last rows outwards.
+    top_row = plane - border;
+    bottom_row = top_row + (height * stride) - stride;
+    above = top_row - (border * stride);
+    below = bottom_row + stride;
+
+    for (n = 0; n < (int)border; n++)
+    {
+        memcpy(above, top_row, stride);
+        memcpy(below, bottom_row, stride);
+        above += stride;
+        below += stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_extend_frame_borders
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame whose borders are filled.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Extends the Y, U and V planes of the frame into their
+ *                  borders by edge replication.
+ *
+ *  SPECIAL NOTES : The chroma geometry is derived by halving the luma
+ *                  stride, height, width and border.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int border = ybf->border;
+    int stride = ybf->y_stride;
+    int height = ybf->y_height;
+    int width = ybf->y_width;
+
+    /* Y plane */
+    extend_plane_borders((char *)ybf->y_buffer, stride, height, width, border);
+
+    /* Both chroma planes share the halved geometry. */
+    stride /= 2;
+    height /= 2;
+    width /= 2;
+    border /= 2;
+
+    /* U plane */
+    extend_plane_borders((char *)ybf->u_buffer, stride, height, width, border);
+
+    /* V plane */
+    extend_plane_borders((char *)ybf->v_buffer, stride, height, width, border);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : copy_plane_words
+ *
+ *  INPUTS        : unsigned int *source : First word to read.
+ *                  int width            : Words copied per row.
+ *                  int height           : Number of rows.
+ *
+ *  OUTPUTS       : _Uncached unsigned int *dest : Receives the words.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies `height` rows of `width` 32-bit words.
+ *
+ *  SPECIAL NOTES : Both pointers advance by `width` words per row, i.e. the
+ *                  rows are treated as back-to-back in memory.
+ *
+ ****************************************************************************/
+static void
+copy_plane_words(unsigned int *source, _Uncached unsigned int *dest, int width, int height)
+{
+    int line;
+    int word;
+
+    for (line = 0; line < height; line++)
+    {
+        for (word = 0; word < width; word++)
+            dest[word] = source[word];
+
+        source += width;
+        dest += width;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+ *
+ *  OUTPUTS       : YV12_BUFFER_CONFIG *dst_ybc : Frame to copy into.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the Y, U and V planes of the source frame into the
+ *                  destination frame, borders included, one 32-bit word at
+ *                  a time.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size. Plane
+ *                  dimensions are taken from the source only. Rows are
+ *                  stepped by the bordered width, so the copy lands on row
+ *                  starts only when each stride equals width plus both
+ *                  borders, and the bordered width is divided by 4 with
+ *                  truncation.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned int *source;
+    _Uncached unsigned int *dest;
+    int rows;
+    int words;
+
+    /* Y plane: start at the top-left corner of the border area. */
+    rows = src_ybc->y_height + (src_ybc->border * 2);
+    words = src_ybc->y_width + (src_ybc->border * 2);
+    words /= 4;
+    source = (unsigned int *)(src_ybc->y_buffer - (src_ybc->border * src_ybc->y_stride) - src_ybc->border);
+    dest = (_Uncached unsigned int *)(dst_ybc->y_buffer - (dst_ybc->border * dst_ybc->y_stride) - dst_ybc->border);
+    copy_plane_words(source, dest, words, rows);
+
+    /* Chroma planes use half the border on every side. */
+    rows = src_ybc->uv_height + (src_ybc->border);
+    words = src_ybc->uv_width + (src_ybc->border);
+    words /= 4;
+
+    /* U plane */
+    source = (unsigned int *)(src_ybc->u_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2);
+    dest = (_Uncached unsigned int *)(dst_ybc->u_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2);
+    copy_plane_words(source, dest, words, rows);
+
+    /* V plane */
+    source = (unsigned int *)(src_ybc->v_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2);
+    dest = (_Uncached unsigned int *)(dst_ybc->v_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2);
+    copy_plane_words(source, dest, words, rows);
+}
diff --git a/vpx_scale/scale_mode.h b/vpx_scale/scale_mode.h
new file mode 100644
index 000000000..2a9ab7612
--- /dev/null
+++ b/vpx_scale/scale_mode.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*****************************************************************************
+*/
+
+#ifndef SCALE_MODE_H
+#define SCALE_MODE_H
+
+/* Selector for how a frame is fitted into its output buffer.
+ * The numeric values are fixed at 0..3. */
+typedef enum
+{
+    MAINTAIN_ASPECT_RATIO = 0,
+    SCALE_TO_FIT          = 1,
+    CENTER                = 2,
+    OTHER                 = 3
+} SCALE_MODE;
+
+
+#endif
diff --git a/vpx_scale/symbian/gen_scalers_armv4.asm b/vpx_scale/symbian/gen_scalers_armv4.asm
new file mode 100644
index 000000000..1c904edae
--- /dev/null
+++ b/vpx_scale/symbian/gen_scalers_armv4.asm
@@ -0,0 +1,773 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ ; Entry points made visible to the linker. Each label is defined by a
+ ; PROC of the same name further down in this file.
+ EXPORT |horizontal_line_4_5_scale_armv4|
+ EXPORT |vertical_band_4_5_scale_armv4|
+ EXPORT |horizontal_line_2_3_scale_armv4|
+ EXPORT |vertical_band_2_3_scale_armv4|
+ EXPORT |horizontal_line_3_5_scale_armv4|
+ EXPORT |vertical_band_3_5_scale_armv4|
+ EXPORT |horizontal_line_3_4_scale_armv4|
+ EXPORT |vertical_band_3_4_scale_armv4|
+ EXPORT |horizontal_line_1_2_scale_armv4|
+ EXPORT |vertical_band_1_2_scale_armv4|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ ; Register aliases shared by every routine in this file.
+ ; The names follow the horizontal_line_* argument order (source, width,
+ ; dest). The vertical_band_* routines receive dest in r0 and the width in
+ ; r2, so there "src" is the destination column pointer and r2 is used
+ ; directly as the loop counter.
+src RN r0
+srcw RN r1
+dest RN r2
+ ; r12 holds a byte-select mask (0xff, or 0xff00ff in vertical_band_1_2).
+mask RN r12
+ ; r10/r11 hold the packed weight pairs 0x3300cd (51, 205) and
+ ; 0x66009a (102, 154) in the 4:5 and 3:5 routines. The 3:4 routines use
+ ; r10/r11 by number for the plain weights 64 and 192.
+c51_205 RN r10
+c102_154 RN r11
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_4_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 4 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+; r0 = UINT8 *source
+; r1 = UINT32 source_width
+; r2 = UINT8 *dest
+; r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+ ; Expands one row 4:5. Four source pixels a,b,c,d plus the first pixel e
+ ; of the next group give: a, 51a+205b, 102b+154c, 154c+102d, 205d+51e,
+ ; each weighted sum scaled down by 256.
+ ; In: r0 = source, r1 = source_width (counted down by 4), r2 = dest.
+ ; r3 (dest_width) is never read; it is reused for the current source word.
+ ;
+ ; Packed multiply: a weight pair is stored as (hi << 16) | lo and a pixel
+ ; pair as (y << 16) | x. Their 32-bit product carries hi*x + lo*y in bits
+ ; 16..31, so "add #0x8000" then "lsr #24" extracts that sum's top byte.
+ stmdb sp!, {r4 - r11, lr}
+
+ mov mask, #255 ; mask for selection
+ ldr c51_205, =0x3300cd ; hi = 51, lo = 205
+ ldr c102_154, =0x66009a ; hi = 102, lo = 154
+
+ ldr r3, [src], #4 ; first four source pixels as one word
+
+hl45_loop
+
+ and r4, r3, mask ; a = src[0]
+ and r5, mask, r3, lsr #8 ; b = src[1]
+ strb r4, [dest], #1 ; des[0] = a
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ and r7, mask, r3, lsr #16 ; c = src[2]
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 ; d = src[3]
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; des[1]
+
+ orr r7, r8, r7, lsl #16 ; c | d
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ ldr r3, [src], #4 ; fetch the next four pixels
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; des[2]
+
+ add r7, r7, #0x8000
+ and r9, mask, r3 ; e = src[4]
+ orr r9, r9, r8, lsl #16 ; d | e
+ mul r9, c51_205, r9 ; d * 205 + 51 * e
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; des[3]
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #4 ; sets the flags tested by bne below
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 ; des[4]
+
+ bne hl45_loop
+
+ ; Final group: r3 still holds the word fetched by the last loop pass.
+ ; b, c and d must be extracted with lsr exactly as in the loop body; a
+ ; left shift would always leave zero in the byte selected by mask.
+ and r4, r3, mask ; a
+ and r5, mask, r3, lsr #8 ; b
+ strb r4, [dest], #1 ; des[0] = a
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ and r7, mask, r3, lsr #16 ; c
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 ; d
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; des[1]
+
+ orr r7, r8, r7, lsl #16 ; c | d
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; des[2]
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; des[3]
+
+ ; NOTE(review): src was post-incremented past the word held in r3, so
+ ; this copies the byte that follows d rather than d itself - confirm
+ ; against the callers' buffer layout.
+ ldrb r3, [src]
+ strb r3, [dest], #1 ; des[4]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_4_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+; * height of the band scaled is 4-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+ ; Scales one band 4:5 in place, one column per loop pass.
+ ; In: r0 = dest, r1 = dest_pitch, r2 = dest_width (column counter).
+ ; Reads rows 0,1,2,3 and 5 of the column, rewrites rows 1..4; row 0 is
+ ; left as it is.
+ ;
+ ; Packed multiply: a weight pair is stored as (hi << 16) | lo and a pixel
+ ; pair as (y << 16) | x. Their 32-bit product carries hi*x + lo*y in bits
+ ; 16..31, so "add #0x8000" then "lsr #24" extracts that sum's top byte.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd
+ ldr c102_154, =0x66009a
+
+vl45_loop
+ mov r3, src ; r3 walks down the column for reads
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+ add lr, src, r1 ; lr walks down the column for writes, from row 1
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ ldrb r8, [r3], r1 ; d = des[dest_pitch*3]
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ orr r7, r8, r7, lsl #16 ; c | d
+ mov r6, r6, lsr #24
+ strb r6, [lr], r1 ; row 1
+
+ ldrb r9, [r3, r1] ; e = des [dest_pitch * 5]
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ orr r9, r9, r8, lsl #16 ; d | e
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 ; row 2
+
+ mul r9, c51_205, r9 ; d * 205 + 51 * e
+ add r7, r7, #0x8000
+ add src, src, #1 ; next column
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 ; row 3
+
+ add r9, r9, #0x8000
+ subs r2, r2, #1 ; sets the flags tested by bne below
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 ; row 4
+
+ bne vl45_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_2_3_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 2 to 3.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+ ; Expands one row 2:3. Source pixels a,b and the first pixel c of the
+ ; next pair give: a, (85a + 171b + 128) >> 8, (171b + 85c + 128) >> 8.
+ ; In: r0 = source, r1 = source_width (counted down by 2), r2 = dest.
+ ; r3 (dest_width) is never read; it is reused as a scratch register.
+ stmdb sp!, {r4 - r11, lr}
+ ldr lr, =85 ; weight of the outer pixel
+ ldr r12, =171 ; weight of the middle pixel
+
+hl23_loop
+
+ ldrb r3, [src], #1 ; a
+ ldrb r4, [src], #1 ; b
+ ldrb r5, [src] ; c (not consumed: it is the next pass's a)
+
+ strb r3, [dest], #1 ; des[0] = a
+ mul r4, r12, r4 ; b * 171
+ mla r6, lr, r3, r4 ; a * 85 + b * 171
+ mla r7, lr, r5, r4 ; c * 85 + b * 171
+
+ add r6, r6, #128
+ mov r6, r6, lsr #8
+ strb r6, [dest], #1 ; des[1]
+
+ add r7, r7, #128
+ mov r7, r7, lsr #8
+ strb r7, [dest], #1 ; des[2]
+
+ subs srcw, srcw, #2
+ bne hl23_loop
+
+ ; Final pair: r5 still holds the pixel src points at (a); b follows it.
+ ldrb r4, [src, #1] ; b
+ strb r5, [dest], #1 ; des[0] = a
+ strb r4, [dest, #1] ; des[2] = b
+
+ mul r4, r12, r4 ; b * 171
+ mla r6, lr, r5, r4 ; a * 85 + b *171
+
+ add r6, r6, #128
+ mov r6, r6, lsr #8
+ strb r6, [dest] ; des[1]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_2_3_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+; * height of the band scaled is 2-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+ ; Scales one band 2:3 in place, one column per loop pass.
+ ; In: r0 = dest, r1 = dest_pitch, r2 = dest_width (column counter).
+ ; Reads rows 0, 1 and 3 of the column and rewrites rows 1 and 2.
+ stmdb sp!, {r4 - r8, lr}
+ ldr lr, =85 ; weight of the outer row
+ ldr r12, =171 ; weight of the middle row
+ add r3, r1, r1, lsl #1 ; 3 * dest_pitch
+
+vl23_loop
+ ldrb r4, [src] ; a = des [0]
+ ldrb r5, [src, r1] ; b = des [dest_pitch]
+ ldrb r7, [src, r3] ; c = des [dest_pitch*3]
+ subs r2, r2, #1 ; sets the flags tested by bne below
+
+ mul r5, r12, r5 ; b * 171
+ mla r6, lr, r4, r5 ; a * 85 + b * 171
+ mla r8, lr, r7, r5 ; c * 85 + b * 171
+
+ add r6, r6, #128
+ mov r6, r6, lsr #8
+ strb r6, [src, r1] ; row 1
+
+ add r8, r8, #128
+ mov r8, r8, lsr #8
+ strb r8, [src, r1, lsl #1] ; row 2
+
+ add src, src, #1 ; next column
+
+ bne vl23_loop
+
+ ldmia sp!, {r4 - r8, pc}
+ ENDP ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+ ; Expands one row 3:5. Source pixels a,b,c plus the first pixel d of the
+ ; next group give: a, 102a+154b, 205b+51c, 51b+205c, 154c+102d, each
+ ; weighted sum scaled down by 256.
+ ; In: r0 = source, r1 = source_width (counted down by 3), r2 = dest.
+ ;
+ ; Packed multiply: a weight pair is stored as (hi << 16) | lo and a pixel
+ ; pair as (y << 16) | x. Their 32-bit product carries hi*x + lo*y in bits
+ ; 16..31, so "add #0x8000" then "lsr #24" extracts that sum's top byte.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; hi = 51, lo = 205
+ ldr c102_154, =0x66009a ; hi = 102, lo = 154
+
+ ldrb r4, [src], #1 ; a = src[0]
+
+hl35_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ strb r4, [dest], #1 ; des[0] = a
+
+ orr r6, r4, r8, lsl #16 ; b | a
+ ldrb r9, [src], #1 ; c = src[2]
+ mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 ; b | c
+ mul r5, c51_205, r5 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ ldrb r4, [src], #1 ; d = src[3], the next pass's a
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; des[1]
+
+ orr r7, r8, r9, lsl #16 ; c | b
+ mul r7, c51_205, r7 ; c * 205 + 51 * b
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; des[2]
+
+ orr r9, r4, r9, lsl #16 ; c | d
+ mul r9, c102_154, r9 ; c * 154 + 102 * d
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; des[3]
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #3 ; sets the flags tested by bpl below
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 ; des[4]
+
+ bpl hl35_loop
+
+ ; Final group: a is already in r4. b has to land in r8 because every
+ ; blend below reads r8; loading it anywhere else would reuse the b of
+ ; the previous group.
+ ldrb r8, [src], #1 ; b = src[1]
+ strb r4, [dest], #1 ; des[0] = a
+
+ orr r6, r4, r8, lsl #16 ; b | a
+ ldrb r9, [src], #1 ; c = src[2]
+ mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 ; b | c
+ mul r5, c51_205, r5 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; des[1]
+
+ orr r7, r8, r9, lsl #16 ; c | b
+ mul r7, c51_205, r7 ; c * 205 + 51 * b
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; des[2]
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; des[3]
+ strb r9, [dest], #1 ; des[4] = c (no following pixel to blend with)
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+ ; Scales one band 3:5 in place, one column per loop pass.
+ ; In: r0 = dest, r1 = dest_pitch, r2 = dest_width (column counter).
+ ; Reads rows 0,1,2 and 5 of the column, rewrites rows 1..4; row 0 is
+ ; left as it is.
+ ;
+ ; Packed multiply: a weight pair is stored as (hi << 16) | lo and a pixel
+ ; pair as (y << 16) | x. Their 32-bit product carries hi*x + lo*y in bits
+ ; 16..31, so "add #0x8000" then "lsr #24" extracts that sum's top byte.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd
+ ldr c102_154, =0x66009a
+
+vl35_loop
+ mov r3, src ; r3 walks down the column for reads
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+ add lr, src, r1 ; lr walks down the column for writes, from row 1
+
+ orr r8, r4, r5, lsl #16 ; b | a
+ mul r6, c102_154, r8 ; a * 102 + 154 * b
+
+ ldrb r8, [r3, r1, lsl #1] ; d = des[dest_pitch*5]
+ orr r3, r7, r5, lsl #16 ; b | c
+ mul r9, c51_205, r3 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ orr r3, r5, r7, lsl #16 ; c | b
+ mov r6, r6, lsr #24
+ strb r6, [lr], r1 ; row 1
+
+ mul r5, c51_205, r3 ; c * 205 + 51 * b
+ add r9, r9, #0x8000
+ orr r3, r8, r7, lsl #16 ; c | d
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 ; row 2
+
+ mul r7, c102_154, r3 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ add src, src, #1 ; next column
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 ; row 3
+
+ add r7, r7, #0x8000
+ subs r2, r2, #1 ; sets the flags tested by bne below
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 ; row 4
+
+
+ bne vl35_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_4_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 4.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC
+ ; Expands one row 3:4. Source pixels a,b,c plus the first pixel a' of the
+ ; next group give: a, (64a + 192b + 128) >> 8, (b + c + 1) >> 1,
+ ; (192c + 64a' + 128) >> 8.
+ ; In: r0 = source, r1 = source_width (counted down by 3), r2 = dest.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r10, =64 ; weight of the far pixel
+ ldr r11, =192 ; weight of the near pixel
+ mov r9, #128 ; rounding term for the >> 8
+
+ ldrb r4, [src], #1 ; a = src[0]
+
+hl34_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ ldrb r7, [src], #1 ; c = src[2]
+ strb r4, [dest], #1 ; des[0] = a
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+
+ add r8, r8, #1 ; b + 1
+ add r8, r8, r7 ; b + c + 1
+ mov r8, r8, asr #1 ; (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1 ; des[1]
+
+ strb r8, [dest], #1 ; des[2]
+
+ ldrb r4, [src], #1 ; a' = src[3], the next pass's a
+
+ mla r7, r11, r7, r9 ; c*192 + 128
+ mla r7, r4, r10, r7 ; a'*64 + c*192 + 128
+
+ subs srcw, srcw, #3 ; sets the flags tested by bpl below
+
+ mov r7, r7, asr #8 ; (a'*64 + c*192 + 128) >> 8
+ strb r7, [dest], #1 ; des[3]
+
+ bpl hl34_loop
+
+ ; Final group: a is already in r4; the last output repeats c because
+ ; no following pixel is read here.
+ ldrb r8, [src], #1 ; b = src[1]
+ ldrb r7, [src], #1 ; c = src[2]
+ strb r4, [dest], #1 ; des[0] = a
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1 ; des[1]
+
+ add r8, r8, #1 ; b + 1
+ add r8, r8, r7 ; b + c + 1
+ mov r8, r8, asr #1 ; (b + c + 1) >> 1
+ strb r8, [dest], #1 ; des[2]
+ strb r7, [dest], #1 ; des[3] = c
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_4_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC
+ ; Scales one band 3:4 in place, one column per loop pass.
+ ; In: r0 = dest, r1 = dest_pitch, r2 = dest_width (column counter).
+ ; Reads rows 0,1,2 and 4 of the column, rewrites rows 1..3; row 0 is
+ ; left as it is.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r10, =64 ; weight of the far row
+ ldr r11, =192 ; weight of the near row
+ mov r9, #128 ; rounding term for the >> 8
+
+; ldr r1,[r1]
+vl34_loop
+ mov r3, src ; r3 walks down the column for reads
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des [dest_pitch*2]
+ add lr, src, r1 ; lr walks down the column for writes, from row 1
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r5, r4 ; a*64 + b*192 + 128
+
+ add r5, r5, #1 ; b + 1
+ add r5, r5, r7 ; b + c + 1
+ mov r5, r5, asr #1 ; (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [lr], r1 ; row 1
+
+ ldrb r4, [r3, r1] ; a' = des [dest_pitch*4]
+
+ strb r5, [lr], r1 ; row 2
+
+ mla r7, r11, r7, r9 ; c*192 + 128
+ mla r7, r4, r10, r7 ; a'*64 + c*192 + 128
+ mov r7, r7, asr #8 ; (a'*64 + c*192 + 128) >> 8
+
+ add src, src, #1 ; next column
+ subs r2, r2, #1 ; sets the flags tested by bne below
+
+ strb r7, [lr] ; row 3
+
+ bne vl34_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_1_2_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 1 to 2.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC
+ ; Expands one row 1:2. Each source pixel a is followed by the rounded
+ ; average (a + b + 1) >> 1 of itself and its right neighbour b; the last
+ ; source pixel is simply written twice.
+ ; In: r0 = source, r1 = source_width, r2 = dest.
+ ; Output is written as halfwords (strh), pixel in the low byte and the
+ ; average in the high byte.
+ ; NOTE(review): that byte order matches memory order only on a
+ ; little-endian target, and strh presumably needs dest 2-byte aligned -
+ ; confirm for the platforms this file is built for.
+ stmdb sp!, {r4 - r5, lr}
+
+ sub srcw, srcw, #1 ; the loop handles all pixels but the last
+
+ ldrb r3, [src], #1 ; a
+ ldrb r4, [src], #1 ; b
+hl12_loop
+ subs srcw, srcw, #1 ; flags drive both ldrneb and bne below
+
+ add r5, r3, r4
+ add r5, r5, #1
+ mov r5, r5, lsr #1 ; (a + b + 1) >> 1
+
+ orr r5, r3, r5, lsl #8 ; a | average << 8
+ strh r5, [dest], #2
+
+ mov r3, r4 ; b becomes the next a
+
+ ldrneb r4, [src], #1 ; fetch the next b only if another pass follows
+ bne hl12_loop
+
+ orr r5, r4, r4, lsl #8 ; last pixel duplicated
+ strh r5, [dest]
+
+ ldmia sp!, {r4 - r5, pc}
+ ENDP ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_1_2_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+; * height of the band scaled is 1-pixel.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC
+ ; Fills row 1 of a band with the rounded average of rows 0 and 2,
+ ; four columns (one 32-bit word) per loop pass.
+ ; In: r0 = dest, r1 = dest_pitch, r2 = dest_width (counted down by 4).
+ ; The even and odd bytes of each word are averaged separately so that the
+ ; 9-bit sums cannot spill into the neighbouring byte.
+ ; NOTE(review): the word loads/stores presumably require dest and
+ ; dest_pitch to be 4-byte aligned - confirm against the callers.
+ stmdb sp!, {r4 - r7, lr}
+
+ ldr mask, =0xff00ff ; mask for selection
+ ldr lr, = 0x010001 ; +1 rounding term for both byte lanes
+
+vl12_loop
+ mov r3, src
+ ldr r4, [r3], r1 ; four pixels of row 0; r3 now points at row 1
+ ldr r5, [r3, r1] ; four pixels of row 2
+
+ add src, src, #4 ; next four columns
+ subs r2, r2, #4 ; sets the flags tested by bpl below
+
+ and r6, r4, mask ; bytes 0 and 2 of row 0
+ and r7, r5, mask ; bytes 0 and 2 of row 2
+
+ add r6, r7, r6
+ add r6, r6, lr
+
+ and r4, mask, r4, lsr #8 ; bytes 1 and 3 of row 0
+ and r5, mask, r5, lsr #8 ; bytes 1 and 3 of row 2
+
+ mov r6, r6, lsr #1
+ and r6, r6, mask ; averaged bytes 0 and 2
+
+ add r4, r5, r4
+ add r4, r4, lr
+
+ mov r4, r4, lsr #1
+ and r4, r4, mask ; averaged bytes 1 and 3
+
+ orr r5, r6, r4, lsl #8 ; interleave the two lanes again
+
+ str r5, [r3] ; row 1
+
+ ; bpl keeps looping while the decremented count is >= 0, so one more
+ ; word is processed after the count reaches exactly zero.
+ bpl vl12_loop
+
+ ldmia sp!, {r4 - r7, pc}
+ ENDP ;|vertical_band_1_2_scale_armv4|
+
+ END
diff --git a/vpx_scale/symbian/gen_scalers_armv4.s b/vpx_scale/symbian/gen_scalers_armv4.s
new file mode 100644
index 000000000..3dfd0b9b9
--- /dev/null
+++ b/vpx_scale/symbian/gen_scalers_armv4.s
@@ -0,0 +1,808 @@
+@ This file was created from a .asm file
+@ using the ads2gas.pl script.
+
+ .equ WIDE_REFERENCE, 0
+ .ifndef ARCHITECTURE
+ .equ ARCHITECTURE, 5
+ .endif
+ .global horizontal_line_4_5_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type horizontal_line_4_5_scale_armv4, function
+ .endif
+ .global vertical_band_4_5_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type vertical_band_4_5_scale_armv4, function
+ .endif
+ .global horizontal_line_2_3_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type horizontal_line_2_3_scale_armv4, function
+ .endif
+ .global vertical_band_2_3_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type vertical_band_2_3_scale_armv4, function
+ .endif
+ .global horizontal_line_3_5_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type horizontal_line_3_5_scale_armv4, function
+ .endif
+ .global vertical_band_3_5_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type vertical_band_3_5_scale_armv4, function
+ .endif
+ .global horizontal_line_3_4_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type horizontal_line_3_4_scale_armv4, function
+ .endif
+ .global vertical_band_3_4_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type vertical_band_3_4_scale_armv4, function
+ .endif
+ .global horizontal_line_1_2_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type horizontal_line_1_2_scale_armv4, function
+ .endif
+ .global vertical_band_1_2_scale_armv4
+ .ifndef NO_TYPE_PSEUDO_OP
+ .type vertical_band_1_2_scale_armv4, function
+ .endif
+
+.text
+
+src .req r0 @ first argument: source pointer (the vertical routines receive dest in r0)
+srcw .req r1 @ second argument: source width (the vertical routines receive dest_pitch in r1)
+dest .req r2 @ third argument of the horizontal routines: destination pointer
+mask .req r12 @ lane mask; loaded with #255 or 0xff00ff by the routines that use it
+c51_205 .req r10 @ loaded with 0x3300cd = (51 << 16) | 205
+c102_154 .req r11 @ loaded with 0x66009a = (102 << 16) | 154
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_4_5_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 4 to 5.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ ****************************************************************************/
+@void horizontal_line_4_5_scale_armv4
+@(
+@ r0 = UINT8 *source
+@ r1 = UINT32 source_width
+@ r2 = UINT8 *dest
+@ r3 = UINT32 dest_width
+@)
+_HorizontalLine_4_5_Scale_ARMv4:
+ horizontal_line_4_5_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address
+@ Two weights are packed per constant so that (x | y << 16) * w leaves w_hi*x + w_lo*y in bits 16-31.
+ mov mask, #255 @ mask for selection
+ ldr c51_205, =0x3300cd @ (51 << 16) | 205
+ ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+ ldr r3, [src], #4 @ r3 = src[0..3] (word load: presumably src is 4-byte aligned - confirm)
+
+hl45_loop:
+
+ and r4, r3, mask @ a = src[0]
+ and r5, mask, r3, lsr #8 @ b = src[1]
+ strb r4, [dest], #1 @ out 0 = a
+
+ orr r6, r4, r5, lsl #16 @ b | a
+ and r7, mask, r3, lsr #16 @ c = src[2]
+ mul r6, c51_205, r6 @ a * 51 + 205 * b
+
+ orr r5, r5, r7, lsl #16 @ c | b
+ mul r5, c102_154, r5 @ b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 @ d = src[3]
+ mov r6, r6, lsr #24 @ keep the top byte of the weighted sum
+ strb r6, [dest], #1 @ out 1
+
+ orr r7, r8, r7, lsl #16 @ c | d
+ mul r7, c102_154, r7 @ c * 154 + 102 * d
+ add r5, r5, #0x8000
+ ldr r3, [src], #4 @ fetch the next four source pixels
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 @ out 2
+
+ add r7, r7, #0x8000
+ and r9, mask, r3 @ e = src[4]
+ orr r9, r9, r8, lsl #16 @ d | e
+ mul r9, c51_205, r9 @ d * 205 + 51 * e
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 @ out 3
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #4 @ four source pixels consumed; flags are tested by bne below
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 @ out 4
+
+ bne hl45_loop
+
+ and r4, r3, mask @ tail: a = low byte of the word fetched by the last pass
+ and r5, mask, r3, lsr #8 @ b (lsr as in the loop; the former lsl #8 always masked to zero)
+ strb r4, [dest], #1
+
+ orr r6, r4, r5, lsl #16 @ b | a
+ mul r6, c51_205, r6 @ a * 51 + 205 * b
+
+ and r7, mask, r3, lsr #16 @ c (formerly lsl #16, always zero)
+ orr r5, r5, r7, lsl #16 @ c | b
+ mul r5, c102_154, r5 @ b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 @ d (formerly lsl #24, always zero)
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1
+
+ orr r7, r8, r7, lsl #16 @ c | d
+ mul r7, c102_154, r7 @ c * 154 + 102 * d
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1
+
+ ldrb r3, [src] @ last output is a copy of the byte at the current src pointer
+ strb r3, [dest], #1
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|horizontal_line_4_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_4_5_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+@ * height of the band scaled is 4-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_4_5_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_4_5_Scale_ARMv4:
+ vertical_band_4_5_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address (lr is reused as a write pointer)
+@ r0 (src alias) = dest, r1 = dest_pitch, r2 = dest_width; one column is rewritten in place per pass.
+ ldr c51_205, =0x3300cd @ (51 << 16) | 205
+ ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+vl45_loop:
+ mov r3, src @ r3 = read pointer stepping down the column
+ ldrb r4, [r3], r1 @ a = des [0]
+ ldrb r5, [r3], r1 @ b = des [dest_pitch]
+ ldrb r7, [r3], r1 @ c = des[dest_pitch*2]
+ add lr, src, r1 @ lr = write pointer, starting at row 1
+
+ orr r6, r4, r5, lsl #16 @ b | a
+ mul r6, c51_205, r6 @ a * 51 + 205 * b
+
+ ldrb r8, [r3], r1 @ d = des[dest_pitch*3]
+ orr r5, r5, r7, lsl #16 @ c | b
+ mul r5, c102_154, r5 @ b * 102 + 154 * c
+ add r6, r6, #0x8000
+ orr r7, r8, r7, lsl #16 @ c | d
+ mov r6, r6, lsr #24 @ keep the top byte of the weighted sum
+ strb r6, [lr], r1 @ row 1
+
+ ldrb r9, [r3, r1] @ e = des [dest_pitch * 5]
+ mul r7, c102_154, r7 @ c * 154 + 102 * d
+ add r5, r5, #0x8000
+ orr r9, r9, r8, lsl #16 @ d | e
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 @ row 2
+
+ mul r9, c51_205, r9 @ d * 205 + 51 * e
+ add r7, r7, #0x8000
+ add src, src, #1 @ next column
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 @ row 3
+
+ add r9, r9, #0x8000
+ subs r2, r2, #1 @ one column done; flags are tested by bne below
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 @ row 4
+
+ bne vl45_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|vertical_band_4_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_2_3_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 2 to 3.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_2_3_scale_armv4
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+_HorizontalLine_2_3_Scale_ARMv4:
+ horizontal_line_2_3_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address (lr is reused as a constant)
+ ldr lr, =85 @ weight of the outer pixel
+ ldr r12, =171 @ weight of the middle pixel
+
+hl23_loop:
+
+ ldrb r3, [src], #1 @ a
+ ldrb r4, [src], #1 @ b
+ ldrb r5, [src] @ c; src is not advanced, so the next pass reads this byte again as its a
+
+ strb r3, [dest], #1 @ out 0 = a
+ mul r4, r12, r4 @ b * 171
+ mla r6, lr, r3, r4 @ a * 85 + b * 171
+ mla r7, lr, r5, r4 @ c * 85 + b * 171
+
+ add r6, r6, #128 @ rounding term before the divide by 256
+ mov r6, r6, lsr #8
+ strb r6, [dest], #1 @ out 1
+
+ add r7, r7, #128
+ mov r7, r7, lsr #8
+ strb r7, [dest], #1 @ out 2
+
+ subs srcw, srcw, #2 @ two source pixels consumed
+ bne hl23_loop
+
+ ldrb r4, [src, #1] @ b
+ strb r5, [dest], #1 @ tail: out 0 = the c left in r5 by the last pass
+ strb r4, [dest, #1] @ out 2 = b, stored ahead of out 1
+
+ mul r4, r12, r4 @ b * 171
+ mla r6, lr, r5, r4 @ a * 85 + b *171
+
+ add r6, r6, #128
+ mov r6, r6, lsr #8
+ strb r6, [dest] @ out 1
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|horizontal_line_2_3_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_2_3_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+@ * height of the band scaled is 2-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_2_3_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_2_3_Scale_ARMv4:
+ vertical_band_2_3_scale_armv4: @
+ stmdb sp!, {r4 - r8, lr} @ preserve r4-r8 and the return address (lr is reused as a constant)
+ ldr lr, =85 @ weight of the outer rows
+ ldr r12, =171 @ weight of the middle row
+ add r3, r1, r1, lsl #1 @ 3 * dest_pitch
+
+vl23_loop:
+ ldrb r4, [src] @ a = des [0]
+ ldrb r5, [src, r1] @ b = des [dest_pitch]
+ ldrb r7, [src, r3] @ c = des [dest_pitch*3]
+ subs r2, r2, #1 @ one column consumed; flags are tested by bne below
+
+ mul r5, r12, r5 @ b * 171
+ mla r6, lr, r4, r5 @ a * 85 + b * 171
+ mla r8, lr, r7, r5 @ c * 85 + b * 171
+
+ add r6, r6, #128 @ rounding term before the divide by 256
+ mov r6, r6, lsr #8
+ strb r6, [src, r1] @ row 1 (overwrites b, which is already in r5)
+
+ add r8, r8, #128
+ mov r8, r8, lsr #8
+ strb r8, [src, r1, lsl #1] @ row 2
+
+ add src, src, #1 @ next column
+
+ bne vl23_loop
+
+ ldmia sp!, {r4 - r8, pc}
+ @ @|vertical_band_2_3_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vp8cx_horizontal_line_3_5_scale_c
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 3 to 5.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void vp8cx_horizontal_line_3_5_scale_c
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+_HorizontalLine_3_5_Scale_ARMv4:
+ horizontal_line_3_5_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address
+@ Each pass turns pixels a, b, c (plus d, the next group's first pixel) into five outputs.
+ ldr c51_205, =0x3300cd @ (51 << 16) | 205
+ ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+ ldrb r4, [src], #1 @ a = src[0]
+
+hl35_loop:
+
+ ldrb r8, [src], #1 @ b = src[1]
+ strb r4, [dest], #1 @ out 0 = a
+
+ orr r6, r4, r8, lsl #16 @ b | a
+ ldrb r9, [src], #1 @ c = src[2]
+ mul r6, c102_154, r6 @ a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 @ b | c
+ mul r5, c51_205, r5 @ b * 205 + 51 * c
+ add r6, r6, #0x8000
+ ldrb r4, [src], #1 @ d = src[3]
+ mov r6, r6, lsr #24 @ keep the top byte of the weighted sum
+ strb r6, [dest], #1 @ out 1
+
+ orr r7, r8, r9, lsl #16 @ c | b
+ mul r7, c51_205, r7 @ c * 205 + 51 * b
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 @ out 2
+
+ orr r9, r4, r9, lsl #16 @ c | d
+ mul r9, c102_154, r9 @ c * 154 + 102 * d
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 @ out 3
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #3 @ three source pixels consumed; flags are tested by bpl below
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 @ out 4
+
+ bpl hl35_loop @ NOTE(review): repeats while the count is >= 0, so a pass also runs at count zero - confirm intended
+
+ ldrb r8, [src], #1 @ b = src[1]; must land in r8, the register the sums below read b from
+ strb r4, [dest], #1 @ tail: out 0 = a (the d left in r4 by the last pass)
+
+ orr r6, r4, r8, lsl #16 @ b | a
+ ldrb r9, [src], #1 @ c = src[2]
+ mul r6, c102_154, r6 @ a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 @ b | c
+ mul r5, c51_205, r5 @ b * 205 + 51 * c
+ add r6, r6, #0x8000
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1
+
+ orr r7, r8, r9, lsl #16 @ c | b
+ mul r7, c51_205, r7 @ c * 205 + 51 * b
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1
+ strb r9, [dest], #1 @ last output repeats c
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|horizontal_line_3_5_scale_armv4|
+
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vp8cx_vertical_band_3_5_scale_c
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+@ * height of the band scaled is 3-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_3_5_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_3_5_Scale_ARMv4:
+ vertical_band_3_5_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address (lr is reused as a write pointer)
+@ r0 (src alias) = dest, r1 = dest_pitch, r2 = dest_width; one column is rewritten in place per pass.
+ ldr c51_205, =0x3300cd @ (51 << 16) | 205
+ ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+vl35_loop:
+ mov r3, src @ r3 = read pointer stepping down the column
+ ldrb r4, [r3], r1 @ a = des [0]
+ ldrb r5, [r3], r1 @ b = des [dest_pitch]
+ ldrb r7, [r3], r1 @ c = des[dest_pitch*2]
+ add lr, src, r1 @ lr = write pointer, starting at row 1
+
+ orr r8, r4, r5, lsl #16 @ b | a
+ mul r6, c102_154, r8 @ a * 102 + 154 * b
+
+ ldrb r8, [r3, r1, lsl #1] @ d = des[dest_pitch*5]
+ orr r3, r7, r5, lsl #16 @ b | c
+ mul r9, c51_205, r3 @ b * 205 + 51 * c
+ add r6, r6, #0x8000
+ orr r3, r5, r7, lsl #16 @ c | b
+ mov r6, r6, lsr #24 @ keep the top byte of the weighted sum
+ strb r6, [lr], r1 @ row 1
+
+ mul r5, c51_205, r3 @ c * 205 + 51 * b
+ add r9, r9, #0x8000
+ orr r3, r8, r7, lsl #16 @ c | d
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 @ row 2
+
+ mul r7, c102_154, r3 @ c * 154 + 102 * d
+ add r5, r5, #0x8000
+ add src, src, #1 @ next column
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 @ row 3
+
+ add r7, r7, #0x8000
+ subs r2, r2, #1 @ one column done; flags are tested by bne below
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 @ row 4
+
+
+ bne vl35_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|vertical_band_3_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_3_4_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 3 to 4.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_3_4_scale_armv4
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+_HorizontalLine_3_4_Scale_ARMv4:
+ horizontal_line_3_4_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address
+@ Each pass turns pixels a, b, c (plus the next group's first pixel) into four outputs.
+ ldr r10, =64 @ weight 64/256
+ ldr r11, =192 @ weight 192/256
+ mov r9, #128 @ rounding term added before the divide by 256
+
+ ldrb r4, [src], #1 @ a = src[0]
+
+hl34_loop:
+
+ ldrb r8, [src], #1 @ b = src[1]
+ ldrb r7, [src], #1 @ c = src[2]
+ strb r4, [dest], #1 @ out 0 = a
+
+ mla r4, r10, r4, r9 @ a*64 + 128
+ mla r4, r11, r8, r4 @ a*64 + b*192 + 128
+
+ add r8, r8, #1 @ b + 1
+ add r8, r8, r7 @ b + c + 1
+ mov r8, r8, asr #1 @ (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 @ (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1 @ out 1
+
+ strb r8, [dest], #1 @ out 2
+
+ ldrb r4, [src], #1 @ [a+1]
+
+ mla r7, r11, r7, r9 @ c*192 + 128
+ mla r7, r4, r10, r7 @ next a*64 + c*192 + 128
+
+ subs srcw, srcw, #3 @ three source pixels consumed; flags are tested by bpl below
+
+ mov r7, r7, asr #8 @ (next a*64 + c*192 + 128) >> 8
+ strb r7, [dest], #1 @ out 3
+
+ bpl hl34_loop @ NOTE(review): repeats while the count is >= 0, so a pass also runs at count zero - confirm intended
+
+ ldrb r8, [src], #1 @ b = src[1]
+ ldrb r7, [src], #1 @ c = src[2]
+ strb r4, [dest], #1 @ tail: out 0 = a (left in r4 by the last pass)
+
+ mla r4, r10, r4, r9 @ a*64 + 128
+ mla r4, r11, r8, r4 @ a*64 + b*192 + 128
+ mov r4, r4, asr #8 @ (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1
+
+ add r8, r8, #1 @ b + 1
+ add r8, r8, r7 @ b + c + 1
+ mov r8, r8, asr #1 @ (b + c + 1) >> 1
+ strb r8, [dest], #1
+ strb r7, [dest], #1 @ last output repeats c
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|horizontal_line_3_4_scale_armv4|
+
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_3_4_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+@ * height of the band scaled is 3-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_3_4_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_3_4_Scale_ARMv4:
+ vertical_band_3_4_scale_armv4: @
+ stmdb sp!, {r4 - r11, lr} @ preserve r4-r11 and the return address (lr is reused as a write pointer)
+@ r0 (src alias) = dest, r1 = dest_pitch, r2 = dest_width; one column is rewritten in place per pass.
+ ldr r10, =64 @ weight 64/256
+ ldr r11, =192 @ weight 192/256
+ mov r9, #128 @ rounding term added before the divide by 256
+
+@ ldr r1,[r1]
+vl34_loop:
+ mov r3, src @ r3 = read pointer stepping down the column
+ ldrb r4, [r3], r1 @ a = des [0]
+ ldrb r5, [r3], r1 @ b = des [dest_pitch]
+ ldrb r7, [r3], r1 @ c = des [dest_pitch*2]
+ add lr, src, r1 @ lr = write pointer, starting at row 1
+
+ mla r4, r10, r4, r9 @ a*64 + 128
+ mla r4, r11, r5, r4 @ a*64 + b*192 + 128
+
+ add r5, r5, #1 @ b + 1
+ add r5, r5, r7 @ b + c + 1
+ mov r5, r5, asr #1 @ (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 @ (a*64 + b*192 + 128) >> 8
+ strb r4, [lr], r1 @ row 1
+
+ ldrb r4, [r3, r1] @ a = des [dest_pitch*4]
+
+ strb r5, [lr], r1 @ row 2
+
+ mla r7, r11, r7, r9 @ c*192 + 128
+ mla r7, r4, r10, r7 @ next a*64 + c*192 + 128
+ mov r7, r7, asr #8 @ (next a*64 + c*192 + 128) >> 8
+
+ add src, src, #1 @ next column
+ subs r2, r2, #1 @ one column done; flags are tested by bne below
+
+ strb r7, [lr] @ row 3
+
+ bne vl34_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ @ @|vertical_band_3_4_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vp8cx_horizontal_line_1_2_scale_c
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 1 to 2.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ ****************************************************************************/
+@void vp8cx_horizontal_line_1_2_scale_c
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+_HorizontalLine_1_2_Scale_ARMv4:
+ horizontal_line_1_2_scale_armv4: @
+ stmdb sp!, {r4 - r5, lr} @ preserve r4-r5 and the return address
+@ Every source pixel is written out followed by its rounded average with the next pixel.
+ sub srcw, srcw, #1 @ the final pixel is handled after the loop
+
+ ldrb r3, [src], #1 @ current pixel
+ ldrb r4, [src], #1 @ next pixel
+hl12_loop:
+ subs srcw, srcw, #1 @ flags are used by ldrneb and bne below
+
+ add r5, r3, r4
+ add r5, r5, #1
+ mov r5, r5, lsr #1 @ (current + next + 1) >> 1
+
+ orr r5, r3, r5, lsl #8 @ pack current pixel (low byte) with the average (high byte)
+ strh r5, [dest], #2 @ halfword store; assumes little-endian order and 2-byte aligned dest - confirm
+
+ mov r3, r4 @ next becomes current
+
+ ldrneb r4, [src], #1 @ fetch another pixel only while the loop continues
+ bne hl12_loop
+
+ orr r5, r4, r4, lsl #8 @ last pixel duplicated into both bytes
+ strh r5, [dest]
+
+ ldmia sp!, {r4 - r5, pc}
+ @ @|horizontal_line_1_2_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vp8cx_vertical_band_1_2_scale_c
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+@ * height of the band scaled is 1-pixel.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vp8cx_vertical_band_1_2_scale_c
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_1_2_Scale_ARMv4:
+ vertical_band_1_2_scale_armv4: @
+ stmdb sp!, {r4 - r7, lr} @ preserve r4-r7 and the return address (lr is reused as a constant below)
+@ r0 (src alias) = dest, r1 = dest_pitch, r2 = dest_width; each pass averages four columns using word accesses.
+ ldr mask, =0xff00ff @ mask for selection
+ ldr lr, = 0x010001 @ +1 for each of the two 16-bit lanes
+
+vl12_loop:
+ mov r3, src
+ ldr r4, [r3], r1 @ r4 = four pixels of row 0; r3 now points at row 1 (word load: presumably 4-byte aligned - confirm)
+ ldr r5, [r3, r1] @ r5 = four pixels of row 2
+
+ add src, src, #4 @ advance to the next four columns
+ subs r2, r2, #4 @ only flag-setting instruction in the loop; tested by bpl below
+
+ and r6, r4, mask @ bytes 0 and 2 of row 0
+ and r7, r5, mask @ bytes 0 and 2 of row 2
+
+ add r6, r7, r6
+ add r6, r6, lr @ row0 + row2 + 1 in each lane
+
+ and r4, mask, r4, lsr #8 @ bytes 1 and 3 of row 0
+ and r5, mask, r5, lsr #8 @ bytes 1 and 3 of row 2
+
+ mov r6, r6, lsr #1 @ halve both lanes at once
+ and r6, r6, mask @ discard the bit shifted down from the upper lane
+
+ add r4, r5, r4
+ add r4, r4, lr @ row0 + row2 + 1 in each lane
+
+ mov r4, r4, lsr #1
+ and r4, r4, mask
+
+ orr r5, r6, r4, lsl #8 @ re-interleave even and odd bytes
+
+ str r5, [r3] @ write the averaged pixels to row 1
+
+ bpl vl12_loop @ NOTE(review): repeats while the count is >= 0, so a pass also runs at count zero - confirm intended
+
+ ldmia sp!, {r4 - r7, pc}
+ @ @|vertical_band_1_2_scale_armv4|
diff --git a/vpx_scale/symbian/scalesystemdependant.c b/vpx_scale/symbian/scalesystemdependant.c
new file mode 100644
index 000000000..a2acc3e9d
--- /dev/null
+++ b/vpx_scale/symbian/scalesystemdependant.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Points the global scaler function pointers at the ARMv4
+ * assembly routines where one is assigned, and at the C routines otherwise.
+ *
+ * SPECIAL NOTES : Does nothing when VPX_NO_GLOBALS is defined.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config(void)
+{
+#ifndef VPX_NO_GLOBALS
+ vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4;
+ vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4;
+ vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+ vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4;
+ vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4;
+ vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+ vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4;
+ vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4;
+ vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+ vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4;
+ vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4;
+ vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+ vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4;
+ vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4;
+ vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+
+ /* The 5:4, 5:3 and 2:1 scalers below are all given the C routines. */
+ vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c;
+ vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c;
+ vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c;
+ vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c;
+ vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c;
+ vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c;
+ vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
+#endif
+}
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
new file mode 100644
index 000000000..f4ab258ed
--- /dev/null
+++ b/vpx_scale/vpx_scale.mk
@@ -0,0 +1,23 @@
+SCALE_SRCS-yes += vpx_scale.mk
+SCALE_SRCS-yes += scale_mode.h
+SCALE_SRCS-yes += yv12extend.h
+SCALE_SRCS-yes += yv12config.h
+SCALE_SRCS-yes += vpxscale.h
+SCALE_SRCS-yes += generic/vpxscale.c
+SCALE_SRCS-yes += generic/yv12config.c
+SCALE_SRCS-yes += generic/yv12extend.c
+SCALE_SRCS-yes += generic/scalesystemdependant.c
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
+
+# arm: with HAVE_ARMV7 the ARM sources are added and generic/scalesystemdependant.c is queued for removal
+SCALE_SRCS-$(HAVE_ARMV7) += arm/scalesystemdependant.c
+SCALE_SRCS-$(HAVE_ARMV7) += arm/yv12extend_arm.c
+SCALE_SRCS_REMOVE-$(HAVE_ARMV7) += generic/scalesystemdependant.c
+
+# neon: NEON frame copy/extend assembly, gated on the same HAVE_ARMV7 switch as the ARM sources above
+SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframeyonly_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
+# Queued removals are appended to SCALE_SRCS-no (presumably the list of sources left out of the build - confirm).
+SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
diff --git a/vpx_scale/vpxscale.h b/vpx_scale/vpxscale.h
new file mode 100644
index 000000000..9a86b75de
--- /dev/null
+++ b/vpx_scale/vpxscale.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VPXSCALE_H
+#define VPXSCALE_H
+
+#include "vpx_scale/yv12config.h"
+void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* The vp8cx_*_c prototypes above are the C scaler routines. */
+/* Scaler function pointers; they are only declared here and must be assigned before use. */
+extern void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_2_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_5_4_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_5_3_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_2_1_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+/* *_armv4 scalers; the signatures match the corresponding function pointers above. */
+void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+
+/* Frame-level scaling entry points and machine-specific configuration. */
+extern void dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled);
+extern void vp8_yv12_scale_or_center
+(
+ YV12_BUFFER_CONFIG *src_yuv_config,
+ YV12_BUFFER_CONFIG *dst_yuv_config,
+ int expanded_frame_width,
+ int expanded_frame_height,
+ int scaling_mode,
+ int HScale,
+ int HRatio,
+ int VScale,
+ int VRatio
+);
+extern void vp8_scale_frame
+(
+ YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ unsigned char *temp_area,
+ unsigned char temp_height,
+ unsigned int hscale,
+ unsigned int hratio,
+ unsigned int vscale,
+ unsigned int vratio,
+ unsigned int interlaced
+);
+extern void vp8_scale_machine_specific_config(void);
+/* Frame border extension: a function pointer plus the plain and _neon routines. */
+extern void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf);
+/* Frame copy, "yonly" variant: a function pointer plus the plain and _neon routines. */
+extern void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+/* Frame copy: a function pointer plus the plain and _neon routines. */
+extern void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+#endif /* VPXSCALE_H */
diff --git a/vpx_scale/wce/gen_scalers_armv4.asm b/vpx_scale/wce/gen_scalers_armv4.asm
new file mode 100644
index 000000000..1c904edae
--- /dev/null
+++ b/vpx_scale/wce/gen_scalers_armv4.asm
@@ -0,0 +1,773 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |horizontal_line_4_5_scale_armv4|
+ EXPORT |vertical_band_4_5_scale_armv4|
+ EXPORT |horizontal_line_2_3_scale_armv4|
+ EXPORT |vertical_band_2_3_scale_armv4|
+ EXPORT |horizontal_line_3_5_scale_armv4|
+ EXPORT |vertical_band_3_5_scale_armv4|
+ EXPORT |horizontal_line_3_4_scale_armv4|
+ EXPORT |vertical_band_3_4_scale_armv4|
+ EXPORT |horizontal_line_1_2_scale_armv4|
+ EXPORT |vertical_band_1_2_scale_armv4|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+src RN r0 ; 1st argument: source pointer (dest pointer in the vertical_band routines)
+srcw RN r1 ; 2nd argument: source width (horizontal_line routines)
+dest RN r2 ; 3rd argument: destination pointer (horizontal_line routines)
+mask RN r12 ; byte-select mask (#255 or 0xff00ff, set by each routine that uses it)
+c51_205 RN r10 ; loaded with 0x3300cd = (51 << 16) | 205
+c102_154 RN r11 ; loaded with 0x66009a = (102 << 16) | 154
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_4_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 4 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+; r0 = UINT8 *source
+; r1 = UINT32 source_width
+; r2 = UINT8 *dest
+; r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ mov mask, #255 ; mask for selection
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+ ldr r3, [src], #4 ; r3 = 4 source bytes as one word, src += 4 (assumes word-aligned, little-endian source - confirm)
+
+hl45_loop
+
+ and r4, r3, mask ; a = src[0]
+ and r5, mask, r3, lsr #8 ; b = src[1]
+ strb r4, [dest], #1 ; des[0] = a
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ and r7, mask, r3, lsr #16 ; c = src[2]
+ mul r6, c51_205, r6 ; a * 51 + 205 * b (in the upper halfword)
+
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsr #24 ; d = src[3]
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; des[1]
+
+ orr r7, r8, r7, lsl #16 ; c | d
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ ldr r3, [src], #4 ; next 4 source bytes
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; des[2]
+
+ add r7, r7, #0x8000
+ and r9, mask, r3 ; e = src[4]
+ orr r9, r9, r8, lsl #16 ; d | e
+ mul r9, c51_205, r9 ; d * 205 + 51 * e
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; des[3]
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #4 ; 4 source pixels consumed
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 ; des[4]
+
+ bne hl45_loop
+
+ and r4, r3, mask ; tail: r3 still holds the word loaded in the last loop iteration
+ and r5, mask, r3, lsl #8 ; NOTE(review): loop uses lsr #8; with mask = 255, (r3 lsl #8) & mask is always 0 - confirm
+ strb r4, [dest], #1
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ mul r6, c51_205, r6
+
+ and r7, mask, r3, lsl #16 ; NOTE(review): same lsl/lsr question as above - confirm
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5
+ add r6, r6, #0x8000
+ and r8, mask, r3, lsl #24 ; NOTE(review): same lsl/lsr question as above - confirm
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1
+
+ orr r7, r8, r7, lsl #16 ; c | d
+ mul r7, c102_154, r7
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1
+
+ ldrb r3, [src] ; last output byte is copied straight from [src]
+ strb r3, [dest], #1
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_4_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+; * height of the band scaled is 4-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl45_loop
+ mov r3, src ; r3 walks down the current column, one row per load
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+ add lr, src, r1 ; lr = write pointer, starting at row 1
+
+ orr r6, r4, r5, lsl #16 ; b | a
+ mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+ ldrb r8, [r3], r1 ; d = des[dest_pitch*3]
+ orr r5, r5, r7, lsl #16 ; c | b
+ mul r5, c102_154, r5 ; b * 102 + 154 * c
+ add r6, r6, #0x8000
+ orr r7, r8, r7, lsl #16 ; c | d
+ mov r6, r6, lsr #24
+ strb r6, [lr], r1 ; write row 1
+
+ ldrb r9, [r3, r1] ; e = des [dest_pitch * 5]
+ mul r7, c102_154, r7 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ orr r9, r9, r8, lsl #16 ; d | e
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 ; write row 2
+
+ mul r9, c51_205, r9 ; d * 205 + 51 * e
+ add r7, r7, #0x8000
+ add src, src, #1 ; next column
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 ; write row 3
+
+ add r9, r9, #0x8000
+ subs r2, r2, #1 ; one column done
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 ; write row 4
+
+ bne vl45_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_2_3_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 2 to 3.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+ ldr lr, =85 ; weight 85
+ ldr r12, =171 ; weight 171 (85 + 171 = 256)
+
+hl23_loop
+
+ ldrb r3, [src], #1 ; a
+ ldrb r4, [src], #1 ; b
+ ldrb r5, [src] ; c (src not advanced: reloaded as a on the next pass)
+
+ strb r3, [dest], #1 ; des[0] = a
+ mul r4, r12, r4 ; b * 171
+ mla r6, lr, r3, r4 ; a * 85 + b * 171
+ mla r7, lr, r5, r4 ; c * 85 + b * 171
+
+ add r6, r6, #128 ; + 128 before the >> 8
+ mov r6, r6, lsr #8
+ strb r6, [dest], #1 ; des[1]
+
+ add r7, r7, #128 ; + 128 before the >> 8
+ mov r7, r7, lsr #8
+ strb r7, [dest], #1 ; des[2]
+
+ subs srcw, srcw, #2 ; 2 source pixels consumed
+ bne hl23_loop
+
+ ldrb r4, [src, #1] ; b
+ strb r5, [dest], #1 ; write the last c loaded in the loop
+ strb r4, [dest, #1] ; write b unmodified, one byte past the interpolated value below
+
+ mul r4, r12, r4 ; b * 171
+ mla r6, lr, r5, r4 ; c * 85 + b * 171 (c from the loop takes the role of a)
+
+ add r6, r6, #128
+ mov r6, r6, lsr #8
+ strb r6, [dest]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_2_3_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+; * height of the band scaled is 2-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+ stmdb sp!, {r4 - r8, lr}
+ ldr lr, =85 ; weight 85
+ ldr r12, =171 ; weight 171 (85 + 171 = 256)
+ add r3, r1, r1, lsl #1 ; 3 * dest_pitch
+
+vl23_loop
+ ldrb r4, [src] ; a = des [0]
+ ldrb r5, [src, r1] ; b = des [dest_pitch]
+ ldrb r7, [src, r3] ; c = des [dest_pitch*3]
+ subs r2, r2, #1 ; flags set here drive the bne at the bottom of the loop
+
+ mul r5, r12, r5 ; b * 171
+ mla r6, lr, r4, r5 ; a * 85 + b * 171
+ mla r8, lr, r7, r5 ; c * 85 + b * 171
+
+ add r6, r6, #128 ; + 128 before the >> 8
+ mov r6, r6, lsr #8
+ strb r6, [src, r1] ; write row 1
+
+ add r8, r8, #128 ; + 128 before the >> 8
+ mov r8, r8, lsr #8
+ strb r8, [src, r1, lsl #1] ; write row 2
+
+ add src, src, #1 ; next column
+
+ bne vl23_loop
+
+ ldmia sp!, {r4 - r8, pc}
+ ENDP ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+ ldrb r4, [src], #1 ; a = src[0]
+
+hl35_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ strb r4, [dest], #1 ; des[0] = a
+
+ orr r6, r4, r8, lsl #16 ; b | a
+ ldrb r9, [src], #1 ; c = src[2]
+ mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 ; b | c
+ mul r5, c51_205, r5 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ ldrb r4, [src], #1 ; d = src[3]
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1 ; des[1]
+
+ orr r7, r8, r9, lsl #16 ; c | b
+ mul r7, c51_205, r7 ; b * 51 + 205 * c
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1 ; des[2]
+
+ orr r9, r4, r9, lsl #16 ; c | d
+ mul r9, c102_154, r9 ; c * 154 + 102 * d
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1 ; des[3]
+
+ add r9, r9, #0x8000
+ subs srcw, srcw, #3 ; 3 source pixels consumed
+ mov r9, r9, lsr #24
+ strb r9, [dest], #1 ; des[4]
+
+ bpl hl35_loop
+
+ ldrb r5, [src], #1 ; b = src[1]; NOTE(review): loaded into r5, but r8 (b of the previous group) is used below - confirm
+ strb r4, [dest], #1
+
+ orr r6, r4, r8, lsl #16 ; b | a
+ ldrb r9, [src], #1 ; c = src[2]
+ mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+ orr r5, r9, r8, lsl #16 ; b | c
+ mul r5, c51_205, r5 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ mov r6, r6, lsr #24
+ strb r6, [dest], #1
+
+ orr r7, r8, r9, lsl #16 ; c | b
+ mul r7, c51_205, r7 ; b * 51 + 205 * c
+ add r5, r5, #0x8000
+ mov r5, r5, lsr #24
+ strb r5, [dest], #1
+
+ add r7, r7, #0x8000
+ mov r7, r7, lsr #24
+ strb r7, [dest], #1
+ strb r9, [dest], #1 ; last output byte is c unmodified
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr c51_205, =0x3300cd ; (51 << 16) | 205
+ ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl35_loop
+ mov r3, src ; r3 walks down the current column, one row per load
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+ add lr, src, r1 ; lr = write pointer, starting at row 1
+
+ orr r8, r4, r5, lsl #16 ; b | a
+ mul r6, c102_154, r8 ; a * 102 + 154 * b
+
+ ldrb r8, [r3, r1, lsl #1] ; d = des[dest_pitch*5]
+ orr r3, r7, r5, lsl #16 ; b | c
+ mul r9, c51_205, r3 ; b * 205 + 51 * c
+ add r6, r6, #0x8000
+ orr r3, r5, r7, lsl #16 ; c | b
+ mov r6, r6, lsr #24
+ strb r6, [lr], r1 ; write row 1
+
+ mul r5, c51_205, r3 ; b * 51 + 205 * c
+ add r9, r9, #0x8000
+ orr r3, r8, r7, lsl #16 ; c | d
+ mov r9, r9, lsr #24
+ strb r9, [lr], r1 ; write row 2
+
+ mul r7, c102_154, r3 ; c * 154 + 102 * d
+ add r5, r5, #0x8000
+ add src, src, #1 ; next column
+ mov r5, r5, lsr #24
+ strb r5, [lr], r1 ; write row 3
+
+ add r7, r7, #0x8000
+ subs r2, r2, #1 ; one column done
+ mov r7, r7, lsr #24
+ strb r7, [lr], r1 ; write row 4
+
+
+ bne vl35_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_4_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 4.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r10, =64 ; weight 64
+ ldr r11, =192 ; weight 192 (64 + 192 = 256)
+ mov r9, #128 ; added before each >> 8
+
+ ldrb r4, [src], #1 ; a = src[0]
+
+hl34_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ ldrb r7, [src], #1 ; c = src[2]
+ strb r4, [dest], #1 ; des[0] = a
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+
+ add r8, r8, #1 ; b + 1
+ add r8, r8, r7 ; b + c + 1
+ mov r8, r8, asr #1 ; (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1 ; des[1]
+
+ strb r8, [dest], #1 ; des[2]
+
+ ldrb r4, [src], #1 ; [a+1]: first pixel of the next group
+
+ mla r7, r11, r7, r9 ; c*192 + 128
+ mla r7, r4, r10, r7 ; c*192 + next a*64 + 128
+
+ subs srcw, srcw, #3 ; 3 source pixels consumed
+
+ mov r7, r7, asr #8 ; (c*192 + next a*64 + 128) >> 8
+ strb r7, [dest], #1 ; des[3]
+
+ bpl hl34_loop
+
+ ldrb r8, [src], #1 ; b = src[1]
+ ldrb r7, [src], #1 ; c = src[2]
+ strb r4, [dest], #1
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [dest], #1
+
+ add r8, r8, #1 ; b + 1
+ add r8, r8, r7 ; b + c + 1
+ mov r8, r8, asr #1 ; (b + c + 1) >> 1
+ strb r8, [dest], #1
+ strb r7, [dest], #1 ; last output byte is c unmodified
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_4_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r10, =64 ; weight 64
+ ldr r11, =192 ; weight 192 (64 + 192 = 256)
+ mov r9, #128 ; added before each >> 8
+
+; ldr r1,[r1]
+vl34_loop
+ mov r3, src ; r3 walks down the current column, one row per load
+ ldrb r4, [r3], r1 ; a = des [0]
+ ldrb r5, [r3], r1 ; b = des [dest_pitch]
+ ldrb r7, [r3], r1 ; c = des [dest_pitch*2]
+ add lr, src, r1 ; lr = write pointer, starting at row 1
+
+ mla r4, r10, r4, r9 ; a*64 + 128
+ mla r4, r11, r5, r4 ; a*64 + b*192 + 128
+
+ add r5, r5, #1 ; b + 1
+ add r5, r5, r7 ; b + c + 1
+ mov r5, r5, asr #1 ; (b + c + 1) >> 1
+
+ mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+ strb r4, [lr], r1 ; write row 1
+
+ ldrb r4, [r3, r1] ; a = des [dest_pitch*4]
+
+ strb r5, [lr], r1 ; write row 2
+
+ mla r7, r11, r7, r9 ; c*192 + 128
+ mla r7, r4, r10, r7 ; c*192 + next a*64 + 128
+ mov r7, r7, asr #8 ; (c*192 + next a*64 + 128) >> 8
+
+ add src, src, #1 ; next column
+ subs r2, r2, #1 ; one column done
+
+ strb r7, [lr] ; write row 3
+
+ bne vl34_loop
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_1_2_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 1 to 2.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC
+ stmdb sp!, {r4 - r5, lr}
+
+ sub srcw, srcw, #1 ; the last pixel is handled after the loop
+
+ ldrb r3, [src], #1 ; a = current pixel
+ ldrb r4, [src], #1 ; b = next pixel
+hl12_loop
+ subs srcw, srcw, #1 ; flags set here drive ldrneb/bne below
+
+ add r5, r3, r4
+ add r5, r5, #1
+ mov r5, r5, lsr #1 ; (a + b + 1) >> 1
+
+ orr r5, r3, r5, lsl #8 ; low byte = a, next byte = average
+ strh r5, [dest], #2 ; store both bytes with one halfword write
+
+ mov r3, r4 ; b becomes a for the next pass
+
+ ldrneb r4, [src], #1 ; fetch the next b only if another pass follows
+ bne hl12_loop
+
+ orr r5, r4, r4, lsl #8 ; last pixel duplicated into both bytes
+ strh r5, [dest]
+
+ ldmia sp!, {r4 - r5, pc}
+ ENDP ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_1_2_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+; * height of the band scaled is 1-pixel.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC
+ stmdb sp!, {r4 - r7, lr}
+
+ ldr mask, =0xff00ff ; mask for selection
+ ldr lr, = 0x010001 ; +1 in each 16-bit lane, added before the >> 1
+
+vl12_loop
+ mov r3, src
+ ldr r4, [r3], r1 ; 4 pixels from row 0; r3 now points at row 1
+ ldr r5, [r3, r1] ; 4 pixels from row 2
+
+ add src, src, #4 ; next 4 columns
+ subs r2, r2, #4 ; flags set here drive the bpl at the bottom of the loop
+
+ and r6, r4, mask ; bytes 0 and 2 of row 0
+ and r7, r5, mask ; bytes 0 and 2 of row 2
+
+ add r6, r7, r6
+ add r6, r6, lr
+
+ and r4, mask, r4, lsr #8 ; bytes 1 and 3 of row 0
+ and r5, mask, r5, lsr #8 ; bytes 1 and 3 of row 2
+
+ mov r6, r6, lsr #1
+ and r6, r6, mask ; (row0 + row2 + 1) >> 1 for bytes 0 and 2
+
+ add r4, r5, r4
+ add r4, r4, lr
+
+ mov r4, r4, lsr #1
+ and r4, r4, mask ; (row0 + row2 + 1) >> 1 for bytes 1 and 3
+
+ orr r5, r6, r4, lsl #8 ; recombine the four averaged bytes
+
+ str r5, [r3] ; write row 1
+
+ bpl vl12_loop ; NOTE(review): bpl also loops when the count reaches exactly 0 - confirm intended
+
+ ldmia sp!, {r4 - r7, pc}
+ ENDP ;|vertical_band_1_2_scale_armv4|
+
+ END
diff --git a/vpx_scale/wce/scalesystemdependant.c b/vpx_scale/wce/scalesystemdependant.c
new file mode 100644
index 000000000..a5a6a5275
--- /dev/null
+++ b/vpx_scale/wce/scalesystemdependant.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Points the vp8_* scaler function pointers at the ARMv4
+ * assembly (_armv4) routines or at the C (_c) versions.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config(void)
+{
+ vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4;
+ vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4;
+ vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+ vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4;
+ vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4;
+ vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+ vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4;
+ vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4;
+ vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+ vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4;
+ vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4;
+ vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+ vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4;
+ vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4;
+ vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+
+
+ vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c;
+ vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c;
+ vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c;
+ vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c;
+ vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c;
+ vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c;
+ vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
+}
diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c
new file mode 100644
index 000000000..da0533e6b
--- /dev/null
+++ b/vpx_scale/win32/scaleopt.c
@@ -0,0 +1,1749 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scaleopt.cpp
+*
+* Description : Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+* Module Statics
+****************************************************************************/
+__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256 ~= 1/5
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256 ~= 2/5
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256 ~= 3/5
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256 ~= 4/5
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; // added before the >> 8
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; // 4:5 horizontal: weights for src[i+1]
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; // 4:5 horizontal: weights for src[i]
+__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; // selects byte 6; used to replicate the last pixel in the 4:5 tail
+__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; // 3:5 horizontal: weights for the second tap
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; // 3:5 horizontal: weights for the first tap
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_3_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to the source line.
+ * unsigned int source_width : Number of source pixels to scale.
+ * unsigned char *dest : Pointer to the destination line.
+ * unsigned int dest_width : Not used.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : The last group replicates its final pixel instead of
+ * reading the pixel that follows it.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+
+ push ebx
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ lea edx, [esi+ecx-3]; // edx = source + source_width - 3: loop end marker
+
+ movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
+ movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_3_5_loop:
+
+ mov eax, DWORD PTR [esi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // second tap * const35_2
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // first tap * const35_1
+
+ mov [edi], ebx // write output 00 (the next 3 bytes are overwritten by the movd below)
+ add esi, 3
+
+ add edi, 5
+ paddw mm0, mm1
+
+ paddw mm0, mm4
+ psrlw mm0, 8
+
+ cmp esi, edx
+ packuswb mm0, mm7
+
+ movd DWORD Ptr [edi-4], mm0 // write outputs 1..4 of this group
+ jl horiz_line_3_5_loop
+
+//Exit:
+ mov eax, DWORD PTR [esi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl eax, 8 // eax = xx 01 02 02
+ and eax, 0xffff0000 // eax = xx xx 02 02
+
+ or eax, ebx // eax = 01 02 02 02
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // second tap * const35_2
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // first tap * const35_1
+
+ mov [edi], ebx // write output 00 (the next 3 bytes are overwritten by the movd below)
+ paddw mm0, mm1
+
+ paddw mm0, mm4
+ psrlw mm0, 8
+
+ packuswb mm0, mm7
+ movd DWORD Ptr [edi+1], mm0 // write outputs 1..4 of the last group
+
+ pop ebx
+
+ }
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_4_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to the source line.
+ * unsigned int source_width : Number of source pixels to scale.
+ * unsigned char *dest : Pointer to the destination line.
+ * unsigned int dest_width : Not used.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : Handles 8 source pixels (10 outputs) per pass.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void)dest_width;
+
+ __asm
+ {
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ lea edx, [esi+ecx-8]; // edx = source + source_width - 8: loop end marker
+
+ movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
+ movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_4_5_loop:
+
+ movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
+
+ movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [edi+5], mm2 // write output 05 (= source 04) xx xx xx
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
+
+ paddw mm0, mm1 // sum of both taps
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7
+
+ movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
+ add edi, 10
+
+ add esi, 8
+ paddw mm2, mm3 // sum of both taps
+
+ paddw mm2, mm4 // added round values
+ cmp esi, edx
+
+ psrlw mm2, 8
+ packuswb mm2, mm7
+
+ movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09
+ jl horiz_line_4_5_loop
+
+//Exit:
+ movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
+
+ movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
+ pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
+
+ psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
+ por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
+
+ movq mm3, mm1
+
+ movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [edi+5], mm2 // write output 05 (= source 04) xx xx xx
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
+
+ paddw mm0, mm1 // sum of both taps
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
+
+ movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
+ paddw mm2, mm3 // sum of both taps
+
+ paddw mm2, mm4 // added round values
+ psrlw mm2, 8
+
+ packuswb mm2, mm7
+ movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09
+
+
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Pointer to the top row of the band.
+ * unsigned int dest_pitch : Byte offset from one row to the next.
+ * unsigned int dest_width : Number of columns (8 handled per pass).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band. The function also has a "C" only
+ * version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_4_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+ movq mm1, [edi] // mm1=Src[3];
+
+ // mm0, mm2 --- Src[2]
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [edi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3]
+ // mm7 -- cleared for unpacking
+
+ movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group
+
+ movq mm5, four_fifths // mm5 = 4/5
+ pmullw mm1, mm5 // d * 4/5
+
+ movq mm6, one_fifth // mm6 = 1/5
+ movq mm2, mm0 // make a copy
+
+ pmullw mm3, mm5 // d * 4/5
+ punpcklbw mm0, mm7 // unpack low
+
+ pmullw mm0, mm6 // an * 1/5
+ punpckhbw mm2, mm7 // unpack high
+
+ paddw mm1, mm0 // d * 4/5 + an * 1/5
+ pmullw mm2, mm6 // an * 1/5
+
+ paddw mm3, mm2 // d * 4/5 + an * 1/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[4]
+
+ movq QWORD ptr [edi+ecx], mm1 // write des[4]
+
+ add edi, 8 // next 8 columns
+ add esi, 8
+
+ sub edx, 8 // 8 columns done
+ jg vs_4_5_loop
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : First line of the band; scaled in place.
+ * unsigned int dest_pitch : Byte offset from one line to the next.
+ * unsigned int dest_width : Bytes per line to process (8 per loop pass).
+ *
+ * OUTPUTS : Lines 1 to 4 of the band are overwritten; line 0 is not written.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ * SPECIAL NOTES : No line below the band is read: des[4] is a plain
+ * copy of src[3]. The function also has a "C" only
+ * version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // edi = two lines below dest
+ add edi, ecx // edi = three lines below dest (line 3)
+
+ pxor mm7, mm7 // clear out mm7 (zero source for byte -> word unpacks)
+ mov edx, dest_width // Loop counter: bytes left in the line
+
+ last_vs_4_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth // mm5 = 1/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // mm6 = 4/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8 // >> 8 removes the weight scale
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[1], packed back to 8 bytes
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths // mm5 = 2/5
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2 // copy of high c
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+ movq mm1, [edi] // mm1=Src[3];
+
+ movq QWORD ptr [edi+ecx], mm1 // write des[4] = unmodified src[3]
+
+ // mm0, mm2 --- Src[2]
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [edi], mm0 // write des[3]
+
+ // mm7 is still zero here, ready for the next pass
+ // step both row pointers to the next 8 columns
+ add edi, 8
+ add esi, 8
+
+ sub edx, 8 // 8 columns done
+ jg last_vs_4_5_loop // repeat while columns remain
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : First line of the band; scaled in place.
+ * unsigned int dest_pitch : Byte offset from one line to the next.
+ * unsigned int dest_width : Bytes per line to process (8 per loop pass).
+ *
+ * OUTPUTS : Lines 1 to 4 of the band are overwritten; line 0 is not written.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ * SPECIAL NOTES : Lines 0..2 hold the source lines a, b, c. Line 5, the
+ * first line of the band below, is read as "an" for
+ * des[4]. The function also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // edi = two lines below dest
+ add edi, ecx // edi = three lines below dest (line 3)
+
+ pxor mm7, mm7 // clear out mm7 (zero source for byte -> word unpacks)
+ mov edx, dest_width // Loop counter: bytes left in the line
+
+ vs_3_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8 // >> 8 removes the weight scale
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[1], packed back to 8 bytes
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high (last use of the zero in mm7)
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 becomes scratch)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [edi], mm4 // write des[3]
+
+ // mm0, mm2 --- Src[2] (c), low and high words
+
+ pxor mm7, mm7 // clear mm7 for unpacking (it was scratch above)
+ movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next band (line 5)
+
+ movq mm5, three_fifths // mm5 = 3/5
+ pmullw mm0, mm5 // c * 3/5
+
+ movq mm6, two_fifths // mm6 = 2/5
+ movq mm3, mm1 // make a copy
+
+ pmullw mm2, mm5 // c * 3/5
+ punpcklbw mm1, mm7 // unpack low
+
+ pmullw mm1, mm6 // an * 2/5
+ punpckhbw mm3, mm7 // unpack high
+
+ paddw mm0, mm1 // c * 3/5 + an * 2/5
+ pmullw mm3, mm6 // an * 2/5
+
+ paddw mm2, mm3 // c * 3/5 + an * 2/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[4]
+
+ movq QWORD ptr [edi+ecx], mm0 // write des[4]
+
+ add edi, 8 // step both row pointers to the next 8 columns
+ add esi, 8
+
+ sub edx, 8 // 8 columns done
+ jg vs_3_5_loop // repeat while columns remain
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : First line of the band; scaled in place.
+ * unsigned int dest_pitch : Byte offset from one line to the next.
+ * unsigned int dest_width : Bytes per line to process (8 per loop pass).
+ *
+ * OUTPUTS : Lines 1 to 4 of the band are overwritten; line 0 is not written.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
+ *
+ * SPECIAL NOTES : No line below the band is read: des[4] is a plain
+ * copy of src[2]. mm7 doubles as a scratch register
+ * inside the loop and is zeroed again before each new
+ * pass. The function also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // edi = two lines below dest
+ add edi, ecx // edi = three lines below dest (line 3)
+
+ pxor mm7, mm7 // clear out mm7 (zero source for byte -> word unpacks)
+ mov edx, dest_width // Loop counter: bytes left in the line
+
+
+ last_vs_3_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8 // >> 8 removes the weight scale
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[1], packed back to 8 bytes
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq QWORD ptr [edi+ecx], mm0 // write des[4] = unmodified src[2]
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high (last use of the zero in mm7)
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 becomes scratch)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [edi], mm4 // write des[3]
+
+ // mm7 held c * 4/5 above. Zero it again, otherwise the unpacks at
+ // the top of the next pass would mix that data into every pixel.
+ pxor mm7, mm7 // clear mm7 for unpacking
+
+ add edi, 8 // step both row pointers to the next 8 columns
+ add esi, 8
+
+ sub edx, 8 // 8 columns done
+ jg last_vs_3_5_loop // repeat while columns remain
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_1_2_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Line 0 of the band; scaled in place.
+ * unsigned int dest_pitch : Byte offset from one line to the next.
+ * unsigned int dest_width : Bytes per line to process (8 per loop pass).
+ *
+ * OUTPUTS : Line 1 is overwritten.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of a band of pixels: line 1 becomes
+ * (line 0 + line 2 + 1) >> 1.
+ *
+ * SPECIAL NOTES : Line 2, the first line of the band below, is read.
+ * The function also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ pxor mm7, mm7 // clear out mm7 (zero source for byte -> word unpacks)
+ mov edx, dest_width // Loop counter: bytes left in the line
+
+ vs_1_2_loop:
+
+ movq mm0, [esi] // get Src[0] (line 0)
+ movq mm1, [esi + ecx * 2] // get Src[1] (line 2)
+
+ movq mm2, mm0 // make copy before unpack
+ movq mm3, mm1 // make copy before unpack
+
+ punpcklbw mm0, mm7 // low Src[0]
+ movq mm6, four_ones // mm6= 1, 1, 1, 1
+
+ punpcklbw mm1, mm7 // low Src[1]
+ paddw mm0, mm1 // low (a + b)
+
+ punpckhbw mm2, mm7 // high Src[0]
+ paddw mm0, mm6 // low (a + b + 1)
+
+ punpckhbw mm3, mm7 // high Src[1]
+ paddw mm2, mm3 // high (a + b )
+
+ psraw mm0, 1 // low (a + b +1 )/2
+ paddw mm2, mm6 // high (a + b + 1)
+
+ psraw mm2, 1 // high (a + b + 1)/2
+ packuswb mm0, mm2 // pack results
+
+ movq [esi+ecx], mm0 // write out eight bytes to line 1
+ add esi, 8 // next 8 columns
+
+ sub edx, 8 // 8 columns done
+ jg vs_1_2_loop // repeat while columns remain
+ }
+
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_1_2_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Line 0 of the band; scaled in place.
+ * unsigned int dest_pitch : Byte offset from one line to the next.
+ * unsigned int dest_width : Bytes per line to process (8 per loop pass).
+ *
+ * OUTPUTS : Line 1 is overwritten.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of the last band: line 1 = line 0.
+ *
+ * SPECIAL NOTES : No line below the band is read; line 0 is simply
+ * duplicated. The function also has a "C" only
+ * version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ mov edx, dest_width // Loop counter: bytes left in the line
+
+ last_vs_1_2_loop:
+
+ movq mm0, [esi] // get Src[0]
+ movq [esi+ecx], mm0 // write out eight bytes one line below
+
+ add esi, 8 // next 8 columns
+ sub edx, 8 // 8 columns done
+
+ jg last_vs_1_2_loop // repeat while columns remain
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_1_2_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to source pixels.
+ * unsigned int source_width : Number of source pixels.
+ * unsigned char *dest : Pointer to destination pixels.
+ * unsigned int dest_width : NOT USED.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. Each
+ * source pixel p[i] yields p[i] followed by
+ * (p[i] + p[i+1] + 1) >> 1.
+ *
+ * SPECIAL NOTES : Pixels are handled in groups of 8. The last group is
+ * done separately and uses its own final pixel in place
+ * of p[i+1], so nothing past that group is read. A line
+ * of 8 pixels or fewer is handled by that last group
+ * alone.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+ mov esi, source
+ mov edi, dest
+
+ pxor mm7, mm7 // zero source for byte -> word unpacks
+ movq mm6, four_ones // rounding term: 1, 1, 1, 1
+
+ mov ecx, source_width // pixels left to scale
+
+ // The main loop tests ecx only after its body has run, so without
+ // this guard a line of 8 pixels or fewer would be scaled twice over
+ // (16 source pixels read, 32 destination bytes written).
+ cmp ecx, 8
+ jle hs_1_2_last
+
+ hs_1_2_loop:
+
+ movq mm0, [esi] // p[0..7]
+ movq mm1, [esi+1] // p[1..8], the right-hand neighbours
+
+ movq mm2, mm0
+ movq mm3, mm1
+
+ movq mm4, mm0 // keep the original pixels for interleaving
+ punpcklbw mm0, mm7
+
+ punpcklbw mm1, mm7
+ paddw mm0, mm1 // low (a + b)
+
+ paddw mm0, mm6 // low (a + b + 1)
+ punpckhbw mm2, mm7
+
+ punpckhbw mm3, mm7
+ paddw mm2, mm3 // high (a + b)
+
+ paddw mm2, mm6 // high (a + b + 1)
+ psraw mm0, 1
+
+ psraw mm2, 1
+ packuswb mm0, mm2 // mm0 = eight interpolated pixels
+
+ movq mm2, mm4
+ punpcklbw mm2, mm0 // interleave original and interpolated, low half
+
+ movq [edi], mm2
+ punpckhbw mm4, mm0 // interleave original and interpolated, high half
+
+ movq [edi+8], mm4
+ add esi, 8 // 8 source pixels consumed
+
+ add edi, 16 // 16 destination pixels produced
+ sub ecx, 8
+
+ cmp ecx, 8
+ jg hs_1_2_loop // more than one group left
+
+ hs_1_2_last:
+// last eight pixel
+
+ movq mm0, [esi]
+ movq mm1, mm0
+
+ movq mm2, mm0
+ movq mm3, mm1
+
+ psrlq mm1, 8 // neighbours: p[1..7] moved down one byte
+ psrlq mm3, 56
+
+ psllq mm3, 56 // mm3 = p[7] alone, in the top byte
+ por mm1, mm3 // neighbours = p[1..7], p[7]
+
+ movq mm3, mm1
+ movq mm4, mm0 // keep the original pixels for interleaving
+
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+
+ paddw mm0, mm1
+ paddw mm0, mm6
+
+ punpckhbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ paddw mm2, mm3
+ paddw mm2, mm6
+
+ psraw mm0, 1
+ psraw mm2, 1
+
+ packuswb mm0, mm2
+ movq mm2, mm4
+
+ punpcklbw mm2, mm0
+ movq [edi], mm2
+
+ punpckhbw mm4, mm0
+ movq [edi+8], mm4
+ }
+}
+
+
+
+
+
+__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; // weights (out of 256) applied to src[i+1]
+__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; // weights (out of 256) applied to src[i]
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_5_4_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Number of source pixels.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : NOT USED.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling down by 5 to 4.
+ *
+ * SPECIAL NOTES : Each pass loads 8 source bytes, of which 5 are used.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ /* Reference "C" form of the loop below:
+ unsigned i;
+ unsigned int a, b, c, d, e;
+ unsigned char *des = dest;
+ const unsigned char *src = source;
+
+ (void) dest_width;
+
+ for ( i=0; i<source_width; i+=5 )
+ {
+ a = src[0];
+ b = src[1];
+ c = src[2];
+ d = src[3];
+ e = src[4];
+
+ des[0] = a;
+ des[1] = ((b*192 + c* 64 + 128)>>8);
+ des[2] = ((c*128 + d*128 + 128)>>8);
+ des[3] = ((d* 64 + e*192 + 128)>>8);
+
+ src += 5;
+ des += 4;
+ }
+ */
+ (void) dest_width;
+
+ __asm
+ {
+ // NOTE(review): the digit rows under some instructions look like the tail of that line's ';' comment (register byte layout).
+ mov esi, source ;
+ mov edi, dest ;
+
+ mov ecx, source_width ;
+ movq mm5, const54_1 ;
+
+ pxor mm7, mm7 ;
+ movq mm6, const54_2 ;
+
+ movq mm4, round_values ;
+ lea edx, [esi+ecx] ;
+ horizontal_line_5_4_loop:
+
+ movq mm0, QWORD PTR [esi] ;
+00 01 02 03 04 05 06 07
+ movq mm1, mm0 ;
+00 01 02 03 04 05 06 07
+
+ psrlq mm0, 8 ;
+01 02 03 04 05 06 07 xx
+ punpcklbw mm1, mm7 ;
+xx 00 xx 01 xx 02 xx 03
+
+ punpcklbw mm0, mm7 ;
+xx 01 xx 02 xx 03 xx 04
+ pmullw mm1, mm5 // src[0..3] * const54_1
+
+ pmullw mm0, mm6 // src[1..4] * const54_2
+ add esi, 5 // 5 source pixels consumed
+
+ add edi, 4 // 4 destination pixels produced
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // + 128
+ psrlw mm1, 8
+
+ cmp esi, edx // edx = source + source_width (flags survive the MMX ops below)
+ packuswb mm1, mm7
+
+ movd DWORD PTR [edi-4], mm1 // edi was already advanced, hence -4
+
+ jl horizontal_line_5_4_loop // signed compare of the two addresses
+
+ }
+
+}
+__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; // 64/256
+__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; // 128/256
+__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; // 192/256
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_5_4_scale_mmx
+ *
+ * INPUTS : unsigned char *source : First of five source lines.
+ * unsigned int src_pitch : Byte offset between source lines.
+ * unsigned char *dest : First of four destination lines.
+ * unsigned int dest_pitch : Byte offset between destination lines.
+ * unsigned int dest_width : Bytes per line to process (4 per loop pass).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 5 to 4 vertical down-scaling. With source lines a..e:
+ * des[0] = a
+ * des[1] = (b*192 + c* 64 + 128) >> 8
+ * des[2] = (c*128 + d*128 + 128) >> 8
+ * des[3] = (d* 64 + e*192 + 128) >> 8
+ *
+ * SPECIAL NOTES : ebx is saved and restored around the loop; eax is
+ * used as a scratch pointer.
+ *
+ ****************************************************************************/
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+ __asm
+ {
+ push ebx
+
+ mov esi, source // Get the source pointer
+ mov ecx, src_pitch // Get the source pitch
+
+ mov edi, dest // Get the destination pointer
+ pxor mm7, mm7 // clear out mm7
+
+ mov edx, dest_pitch // Get the destination pitch
+ mov ebx, dest_width // Loop counter: bytes left in the line
+
+ vs_5_4_loop:
+
+ movd mm0, DWORD ptr [esi] // src[0];
+ movd mm1, DWORD ptr [esi+ecx] // src[1];
+
+ movd mm2, DWORD ptr [esi+ecx*2] // src[2];
+ lea eax, [esi+ecx*2] // eax = source line 2
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ movq mm3, mm2 // second copy of c
+ pmullw mm1, three_fourths // b * 3/4
+
+ pmullw mm2, one_fourths // c * 1/4
+ movd mm4, [eax+ecx] // src[3];
+
+ pmullw mm3, two_fourths // c * 2/4
+ punpcklbw mm4, mm7
+
+ movq mm5, mm4 // second copy of d
+ pmullw mm4, two_fourths // d * 2/4
+
+ paddw mm1, mm2 // b * 3/4 + c * 1/4
+ movd mm6, [eax+ecx*2] // src[4];
+
+ pmullw mm5, one_fourths // d * 1/4
+ paddw mm1, round_values; // + 128
+
+ paddw mm3, mm4 // c * 2/4 + d * 2/4
+ psrlw mm1, 8
+
+ punpcklbw mm6, mm7
+ paddw mm3, round_values // + 128
+
+ pmullw mm6, three_fourths // e * 3/4
+ psrlw mm3, 8
+
+ packuswb mm1, mm7 // des[1]
+ packuswb mm3, mm7 // des[2]
+
+ movd DWORD PTR [edi], mm0 // write des[0] = src[0]
+ movd DWORD PTR [edi+edx], mm1 // write des[1]
+
+
+ paddw mm5, mm6 // d * 1/4 + e * 3/4
+ movd DWORD PTR [edi+edx*2], mm3 // write des[2]
+
+ lea eax, [edi+edx*2] // eax = destination line 2
+ paddw mm5, round_values // + 128
+
+ psrlw mm5, 8
+ add edi, 4 // next 4 destination columns
+
+ packuswb mm5, mm7 // des[3]
+ movd DWORD PTR [eax+edx], mm5 // write des[3]
+
+ add esi, 4 // next 4 source columns
+ sub ebx, 4 // 4 columns done
+
+ jg vs_5_4_loop // repeat while columns remain
+
+ pop ebx
+ }
+}
+
+
+__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; // weights (out of 256) applied to src[1], src[3]
+__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; // weights (out of 256) applied to src[0], src[2], src[4]
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_5_3_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Number of source pixels.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : NOT USED.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Scales a horizontal line down by 5 to 3. For each
+ * group of five source pixels a..e:
+ * des[0] = a
+ * des[1] = (b* 85 + c*171 + 128) >> 8
+ * des[2] = (d*171 + e* 85 + 128) >> 8
+ *
+ * SPECIAL NOTES : Each pass loads 8 source bytes. Inside the loop a
+ * group is stored as 4 bytes; the 4th byte is zero and
+ * is overwritten by the next group. The final group is
+ * stored as 2 + 1 bytes, so exactly 3 bytes are written
+ * for it.
+ * NOTE(review): the loop body runs once before its
+ * test, so a line of a single group (source_width 5)
+ * appears to be processed as two groups - confirm that
+ * callers never pass such a width.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_3_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+
+ (void) dest_width;
+ __asm
+ {
+ // NOTE(review): the digit rows under some instructions look like the tail of that line's ';' comment (register byte layout).
+ mov esi, source ;
+ mov edi, dest ;
+
+ mov ecx, source_width ;
+ movq mm5, const53_1 ;
+
+ pxor mm7, mm7 ;
+ movq mm6, const53_2 ;
+
+ movq mm4, round_values ;
+ lea edx, [esi+ecx-5] ;
+ horizontal_line_5_3_loop:
+
+ movq mm0, QWORD PTR [esi] ;
+00 01 02 03 04 05 06 07
+ movq mm1, mm0 ;
+00 01 02 03 04 05 06 07
+
+ psllw mm0, 8 ;
+xx 00 xx 02 xx 04 xx 06
+ psrlw mm1, 8 ;
+01 xx 03 xx 05 xx 07 xx
+
+ psrlw mm0, 8 ;
+00 xx 02 xx 04 xx 06 xx
+ psllq mm1, 16 ;
+xx xx 01 xx 03 xx 05 xx
+
+ pmullw mm0, mm6 // even pixels * const53_2
+
+ pmullw mm1, mm5 // odd pixels * const53_1
+ add esi, 5 // 5 source pixels consumed
+
+ add edi, 3 // 3 destination pixels produced
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // + 128
+ psrlw mm1, 8
+
+ cmp esi, edx // edx = start of the final group
+ packuswb mm1, mm7
+
+ movd DWORD PTR [edi-3], mm1 // edi was already advanced, hence -3
+ jl horizontal_line_5_3_loop // signed compare of the two addresses
+
+//exit condition
+ movq mm0, QWORD PTR [esi] ;
+00 01 02 03 04 05 06 07
+ movq mm1, mm0 ;
+00 01 02 03 04 05 06 07
+
+ psllw mm0, 8 ;
+xx 00 xx 02 xx 04 xx 06
+ psrlw mm1, 8 ;
+01 xx 03 xx 05 xx 07 xx
+
+ psrlw mm0, 8 ;
+00 xx 02 xx 04 xx 06 xx
+ psllq mm1, 16 ;
+xx xx 01 xx 03 xx 05 xx
+
+ pmullw mm0, mm6
+
+ pmullw mm1, mm5
+ paddw mm1, mm0
+
+ paddw mm1, mm4
+ psrlw mm1, 8
+
+ packuswb mm1, mm7
+ movd eax, mm1 // eax = the three result bytes plus a zero byte
+
+ mov edx, eax
+ shr edx, 16 // dl = des[2]
+
+ mov WORD PTR[edi], ax // write des[0], des[1]
+ mov BYTE PTR[edi+2], dl // write des[2]
+
+ }
+
+}
+
+__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; // 85/256
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; // 171/256
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_5_3_scale_mmx
+ *
+ * INPUTS : unsigned char *source : First of five source lines.
+ * unsigned int src_pitch : Byte offset between source lines.
+ * unsigned char *dest : First of three destination lines.
+ * unsigned int dest_pitch : Byte offset between destination lines.
+ * unsigned int dest_width : Bytes per line to process (4 per loop pass).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 5 to 3 vertical down-scaling. With source lines a..e:
+ * des[0] = a
+ * des[1] = (b* 85 + c*171 + 128) >> 8
+ * des[2] = (d*171 + e* 85 + 128) >> 8
+ *
+ * SPECIAL NOTES : ebx is saved and restored around the loop; eax is
+ * used as a scratch pointer.
+ *
+ ****************************************************************************/
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+ __asm
+ {
+ push ebx
+
+ mov esi, source // Get the source pointer
+ mov ecx, src_pitch // Get the source pitch
+
+ mov edi, dest // Get the destination pointer
+ pxor mm7, mm7 // clear out mm7
+
+ mov edx, dest_pitch // Get the destination pitch
+ movq mm5, one_thirds // mm5 = 1/3
+
+ movq mm6, two_thirds // mm6 = 2/3
+ mov ebx, dest_width; // Loop counter: bytes left in the line
+
+ vs_5_3_loop:
+
+ movd mm0, DWORD ptr [esi] // src[0];
+ movd mm1, DWORD ptr [esi+ecx] // src[1];
+
+ movd mm2, DWORD ptr [esi+ecx*2] // src[2];
+ lea eax, [esi+ecx*2] // eax = source line 2
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ pmullw mm1, mm5 // b * 1/3
+ pmullw mm2, mm6 // c * 2/3
+
+ movd mm3, DWORD ptr [eax+ecx] // src[3];
+ movd mm4, DWORD ptr [eax+ecx*2] // src[4];
+
+ punpcklbw mm3, mm7
+ punpcklbw mm4, mm7
+
+ pmullw mm3, mm6 // d * 2/3
+ pmullw mm4, mm5 // e * 1/3
+
+
+ movd DWORD PTR [edi], mm0 // write des[0] = src[0]
+ paddw mm1, mm2 // b * 1/3 + c * 2/3
+
+ paddw mm1, round_values // + 128
+ psrlw mm1, 8
+
+ packuswb mm1, mm7 // des[1]
+ paddw mm3, mm4 // d * 2/3 + e * 1/3
+
+ paddw mm3, round_values // + 128
+ movd DWORD PTR [edi+edx], mm1 // write des[1]
+
+ psrlw mm3, 8
+ packuswb mm3, mm7 // des[2]
+
+ movd DWORD PTR [edi+edx*2], mm3 // write des[2]
+
+
+ add edi, 4 // next 4 columns
+ add esi, 4
+
+ sub ebx, 4 // 4 columns done
+ jg vs_5_3_loop // repeat while columns remain
+
+ pop ebx
+ }
+}
+
+
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_2_1_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to source pixels.
+ * unsigned int source_width : NOT USED.
+ * unsigned char *dest : Pointer to destination pixels.
+ * unsigned int dest_width : Number of destination pixels.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels:
+ * every second source pixel (even index) is copied.
+ *
+ * SPECIAL NOTES : Each pass loads 8 source bytes and stores 4.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width; // dest_width is in fact read by the asm below
+ (void) source_width;
+ __asm
+ {
+ mov esi, source
+ mov edi, dest
+
+ pxor mm7, mm7
+ mov ecx, dest_width // loop bound
+
+ xor edx, edx // edx = destination index
+ hs_2_1_loop:
+
+ movq mm0, [esi+edx*2] // 8 source pixels starting at index 2 * edx
+ psllw mm0, 8 // discard the odd pixel of each pair ...
+
+ psrlw mm0, 8 // ... leaving the even pixel as a word
+ packuswb mm0, mm7 // four kept pixels in the low dword
+
+ movd DWORD Ptr [edi+edx], mm0;
+ add edx, 4 // 4 destination pixels produced
+
+ cmp edx, ecx
+ jl hs_2_1_loop // signed compare against dest_width
+
+ }
+}
+
+
+/* 2 to 1 vertical down-scale without filtering: copies dest_width bytes
+ * from source to dest with vpx_memcpy. Both pitch arguments are unused.
+ */
+static
+void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ (void) dest_pitch;
+ (void) src_pitch;
+ vpx_memcpy(dest, source, dest_width);
+}
+
+
+__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; // 48/256 = 3/16
+__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; // 160/256 = 10/16
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_2_1_scale_i_mmx
+ *
+ * INPUTS : unsigned char *source : The centre source line.
+ * unsigned int src_pitch : Byte offset between source lines.
+ * unsigned char *dest : The destination line.
+ * unsigned int dest_pitch : NOT USED.
+ * unsigned int dest_width : Bytes per line to process (4 per loop pass).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2 to 1 vertical down-scaling with a 3-tap filter:
+ * des = (above*48 + centre*160 + below*48 + 128) >> 8
+ *
+ * SPECIAL NOTES : The line one pitch before source is read as well as
+ * the line after it.
+ *
+ ****************************************************************************/
+static
+void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+ (void) dest_pitch;
+ __asm
+ {
+ mov esi, source
+ mov edi, dest
+
+ mov eax, src_pitch
+ mov edx, dest_width
+
+ pxor mm7, mm7
+ sub esi, eax //back one line
+
+
+ lea ecx, [esi+edx]; // ecx = end of the line above, the loop bound
+ movq mm6, round_values; // loaded, but the add below reads round_values from memory
+
+ movq mm5, three_sixteenths;
+ movq mm4, ten_sixteenths;
+
+ vs_2_1_i_loop:
+ movd mm0, [esi] // line above
+ movd mm1, [esi+eax] // centre line
+
+ movd mm2, [esi+eax*2] // line below
+ punpcklbw mm0, mm7
+
+ pmullw mm0, mm5 // above * 3/16
+ punpcklbw mm1, mm7
+
+ pmullw mm1, mm4 // centre * 10/16
+ punpcklbw mm2, mm7
+
+ pmullw mm2, mm5 // below * 3/16
+ paddw mm0, round_values // + 128
+
+ paddw mm1, mm2
+ paddw mm0, mm1 // sum of the three weighted lines + 128
+
+ psrlw mm0, 8
+ packuswb mm0, mm7
+
+ movd DWORD PTR [edi], mm0 // write 4 result pixels
+ add esi, 4
+
+ add edi, 4;
+ cmp esi, ecx
+ jl vs_2_1_i_loop // signed compare of the two addresses
+
+ }
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : register_mmxscalers
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Points the vp8_* scaler function pointers at the MMX
+ * routines of this file. The 3:4 and 2:3 scalers are
+ * pointed at the vp8cx_*_c routines.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void
+register_mmxscalers(void)
+{
+ vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
+ vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
+ vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
+ vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
+ vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
+ vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
+ vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
+ vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
+ vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
+ // 3:4 and 2:3 up-scalers: the "C" routines are registered.
+ vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
+ vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
+ vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+ vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
+ vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
+ vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+
+
+ // Down-scalers (5:4, 5:3 and 2:1).
+ vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
+ vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
+ vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
+ vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
+ vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
+ vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
+ vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
+
+
+
+
+}
diff --git a/vpx_scale/win32/scalesystemdependant.c b/vpx_scale/win32/scalesystemdependant.c
new file mode 100644
index 000000000..9ed48bfc6
--- /dev/null
+++ b/vpx_scale/win32/scalesystemdependant.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scalesystemdependant.c
+*
+* Description : Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+#include "cpuidlib.h"
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+extern void register_generic_scalers(void);
+extern void register_mmxscalers(void);
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Queries the processor flags and points the vp8_* scaler
+ * function pointers at the MMX or the "C" routines.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void
+vp8_scale_machine_specific_config(void)
+{
+ // If MMX supported then set to use MMX versions of functions else
+ // use original 'C' versions.
+ int mmx_enabled;
+ int xmm_enabled;
+ int wmt_enabled;
+
+ vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+ // Any one of the three flags selects the MMX scalers.
+ if (mmx_enabled || xmm_enabled || wmt_enabled)
+ {
+ register_mmxscalers();
+ }
+ else
+ {
+ vp8_horizontal_line_1_2_scale = vp8cx_horizontal_line_1_2_scale_c;
+ vp8_vertical_band_1_2_scale = vp8cx_vertical_band_1_2_scale_c;
+ vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+ vp8_horizontal_line_3_5_scale = vp8cx_horizontal_line_3_5_scale_c;
+ vp8_vertical_band_3_5_scale = vp8cx_vertical_band_3_5_scale_c;
+ vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+ vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
+ vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
+ vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+ vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
+ vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
+ vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+ vp8_horizontal_line_4_5_scale = vp8cx_horizontal_line_4_5_scale_c;
+ vp8_vertical_band_4_5_scale = vp8cx_vertical_band_4_5_scale_c;
+ vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+
+ // Down-scalers (5:4, 5:3 and 2:1).
+ vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c;
+ vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c;
+ vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c;
+ vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c;
+ vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c;
+ vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c;
+ vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
+
+ }
+}
diff --git a/vpx_scale/x86_64/scaleopt.c b/vpx_scale/x86_64/scaleopt.c
new file mode 100644
index 000000000..3d2d5f237
--- /dev/null
+++ b/vpx_scale/x86_64/scaleopt.c
@@ -0,0 +1,1749 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scaleopt.c
+*
+* Description : Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+* Module Statics
+****************************************************************************/
+__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256 ~= 1/5 (weights are 8-bit fractions, results are shifted right by 8)
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256 ~= 2/5
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256 ~= 3/5
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256 ~= 4/5
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; // added before the >>8 so the result rounds to nearest
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; // added before the >>1 in the 1:2 averaging routines
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; // 4:5 horizontal: per-lane weights applied to the right-hand neighbour
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; // 4:5 horizontal: per-lane weights applied to the left-hand pixel
+__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; // keeps byte 6 only; the 4:5 horizontal tail uses it to replicate the last pixel
+__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; // 3:5 horizontal: per-lane weights applied to the right-hand neighbour
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; // 3:5 horizontal: per-lane weights applied to the left-hand pixel
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*
+* ROUTINE : horizontal_line_3_5_scale_mmx
+*
+* INPUTS : const unsigned char *source : Row of source pixels (read only).
+* unsigned int source_width : Number of source pixels in the row.
+* unsigned char *dest : Row that receives the scaled pixels.
+* unsigned int dest_width : Not used.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
+* Each group of 3 source pixels a, b, c (plus d, the first
+* pixel of the following group) yields 5 output pixels,
+* approximately:
+* a, (2a+3b)/5, (4b+c)/5, (b+4c)/5, (3c+2d)/5
+* using the 8-bit weights in const35_1 / const35_2.
+*
+* SPECIAL NOTES : The last group has no following pixel, so the code after
+* the loop substitutes c for d. Every group is fetched with
+* a 4-byte load, so the load for the last 3-pixel group
+* reads one byte beyond that group.
+*
+****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+
+ push rbx
+
+ mov rsi, source
+ mov rdi, dest
+
+ mov ecx, source_width
+ lea rdx, [rsi+rcx-3];
+
+ movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
+ movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_3_5_loop:
+
+ mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // right-hand neighbours * const35_2
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // left-hand pixels * const35_1
+
+ mov [rdi], ebx // write output 00 xx xx xx (bytes 1..4 are overwritten below)
+ add rsi, 3
+
+ add rdi, 5
+ paddw mm0, mm1
+
+ paddw mm0, mm4
+ psrlw mm0, 8
+
+ cmp rsi, rdx
+ packuswb mm0, mm7
+
+ movd DWORD Ptr [rdi-4], mm0 // write the four interpolated outputs of this group
+ jl horiz_line_3_5_loop
+
+//Exit: last group, pixel 02 is used in place of the missing pixel 03
+ mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl eax, 8 // eax = xx 01 02 02
+ and eax, 0xffff0000 // eax = xx xx 02 02
+
+ or eax, ebx // eax = 01 02 02 02
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // right-hand neighbours * const35_2
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // left-hand pixels * const35_1
+
+ mov [rdi], ebx // write output 00 xx xx xx (bytes 1..4 are overwritten below)
+ paddw mm0, mm1
+
+ paddw mm0, mm4
+ psrlw mm0, 8
+
+ packuswb mm0, mm7
+ movd DWORD Ptr [rdi+1], mm0 // write the four interpolated outputs of the last group
+
+ pop rbx
+
+ }
+
+}
+
+
+/****************************************************************************
+*
+* ROUTINE : horizontal_line_4_5_scale_mmx
+*
+* INPUTS : const unsigned char *source : Row of source pixels (read only).
+* unsigned int source_width : Number of source pixels in the row.
+* unsigned char *dest : Row that receives the scaled pixels.
+* unsigned int dest_width : Not used.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
+* Each pass consumes 8 source pixels and writes 10 output
+* pixels (two 4->5 groups). For a group a, b, c, d followed
+* by e the outputs are approximately:
+* a, (a+4b)/5, (2b+3c)/5, (3c+2d)/5, (4d+e)/5
+* using the 8-bit weights in const45_1 / const45_2.
+*
+* SPECIAL NOTES : The loop reads the pixel that follows each 8-pixel run
+* (load at [rsi+1]). The code after the loop handles the
+* final 8 pixels and replicates pixel 07 in place of the
+* missing pixel 08.
+*
+****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void)dest_width;
+
+ __asm
+ {
+
+ mov rsi, source
+ mov rdi, dest
+
+ mov ecx, source_width
+ lea rdx, [rsi+rcx-8];
+
+ movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
+ movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_4_5_loop:
+
+ movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, QWORD PTR [rsi+1]; // mm1 = 01 02 03 04 05 06 07 08
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
+
+ movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [rdi+5], mm2 // write output 05 xx xx xx (source pixel 04 lands in output byte 5)
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
+
+ paddw mm0, mm1 // sum the two weighted neighbours
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7
+
+ movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
+ add rdi, 10
+
+ add rsi, 8
+ paddw mm2, mm3 // sum the two weighted neighbours
+
+ paddw mm2, mm4 // added round values
+ cmp rsi, rdx
+
+ psrlw mm2, 8
+ packuswb mm2, mm7
+
+ movd DWORD PTR [rdi-4], mm2 // write output 06 07 08 09
+ jl horiz_line_4_5_loop
+
+//Exit: final 8 pixels, pixel 07 stands in for the missing pixel 08
+ movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
+
+ movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
+ pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
+
+ psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
+ por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
+
+ movq mm3, mm1
+
+ movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [rdi+5], mm2 // write output 05 xx xx xx (source pixel 04 lands in output byte 5)
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
+
+ paddw mm0, mm1 // sum the two weighted neighbours
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
+
+ movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
+ paddw mm2, mm3 // sum the two weighted neighbours
+
+ paddw mm2, mm4 // added round values
+ psrlw mm2, 8
+
+ packuswb mm2, mm7
+ movd DWORD PTR [rdi+6], mm2 // write output 06 07 08 09
+
+
+ }
+}
+
+/****************************************************************************
+*
+* ROUTINE : vertical_band_4_5_scale_mmx
+*
+* INPUTS : unsigned char *dest : First row of the band; scaled in place.
+* unsigned int dest_pitch : Byte distance between consecutive rows.
+* unsigned int dest_width : Number of columns to process.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+* Rows 0..3 hold the source rows a, b, c, d and row 5 holds
+* an, the first row of the next band. Rows 1..4 are
+* rewritten, approximately, as:
+* (a+4b)/5, (2b+3c)/5, (3c+2d)/5, (4d+an)/5
+* Row 0 is left unchanged.
+*
+* SPECIAL NOTES : The routine uses the first line of the band below
+* the current band. The function also has a "C" only
+* version. Columns are processed 8 at a time, so a
+* dest_width that is not a multiple of 8 is rounded up.
+*
+****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+
+ mov rsi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea rdi, [rsi+rcx*2] // two lines below
+ add rdi, rcx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_4_5_loop:
+
+ movq mm0, QWORD ptr [rsi] // src[0];
+ movq mm1, QWORD ptr [rsi+rcx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [rsi+rcx], mm0 // write des[1]
+ movq mm0, [rsi+rcx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
+ movq mm1, [rdi] // mm1=Src[3];
+
+ // mm0, mm2 --- Src[2]
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [rdi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3]
+ // mm7 -- cleared for unpacking
+
+ movq mm0, [rdi+rcx*2] // mm0, Src[0] of the next group
+
+ movq mm5, four_fifths // mm5 = 4/5
+ pmullw mm1, mm5 // d * 4/5
+
+ movq mm6, one_fifth // mm6 = 1/5
+ movq mm2, mm0 // make a copy
+
+ pmullw mm3, mm5 // d * 4/5
+ punpcklbw mm0, mm7 // unpack low
+
+ pmullw mm0, mm6 // an * 1/5
+ punpckhbw mm2, mm7 // unpack high
+
+ paddw mm1, mm0 // d * 4/5 + an * 1/5
+ pmullw mm2, mm6 // an * 1/5
+
+ paddw mm3, mm2 // d * 4/5 + an * 1/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[4]
+
+ movq QWORD ptr [rdi+rcx], mm1 // write des[4]
+
+ add rdi, 8
+ add rsi, 8
+
+ sub rdx, 8 // eight columns done per pass
+ jg vs_4_5_loop
+ }
+}
+
+/****************************************************************************
+*
+* ROUTINE : last_vertical_band_4_5_scale_mmx
+*
+* INPUTS : unsigned char *dest : First row of the band; scaled in place.
+* unsigned int dest_pitch : Byte distance between consecutive rows.
+* unsigned int dest_width : Number of columns to process.
+*
+* OUTPUTS : None.
+*
+* RETURNS : None
+*
+* FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+* Rows 0..3 hold the source rows a, b, c, d. Rows 1..3 are
+* rewritten, approximately, as:
+* (a+4b)/5, (2b+3c)/5, (3c+2d)/5
+* and row 4 receives an unmodified copy of d.
+*
+* SPECIAL NOTES : No row below the band is read; des[4] is a copy of
+* Src[3] instead of a blend with the next band. The
+* function also has a "C" only version. Columns are
+* processed 8 at a time, so a dest_width that is not a
+* multiple of 8 is rounded up.
+*
+****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov rsi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea rdi, [rsi+rcx*2] // two lines below
+ add rdi, rcx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ last_vs_4_5_loop:
+
+ movq mm0, QWORD ptr [rsi] // src[0];
+ movq mm1, QWORD ptr [rsi+rcx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [rsi+rcx], mm0 // write des[1]
+ movq mm0, [rsi+rcx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
+ movq mm1, [rdi] // mm1=Src[3];
+
+ movq QWORD ptr [rdi+rcx], mm1 // write des[4]; plain copy of Src[3], there is no band below
+
+ // mm0, mm2 --- Src[2]
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [rdi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3]
+ // mm7 -- cleared for unpacking
+ add rdi, 8
+ add rsi, 8
+
+ sub rdx, 8 // eight columns done per pass
+ jg last_vs_4_5_loop
+ }
+}
+
+/****************************************************************************
+*
+* ROUTINE : vertical_band_3_5_scale_mmx
+*
+* INPUTS : unsigned char *dest : First row of the band; scaled in place.
+* unsigned int dest_pitch : Byte distance between consecutive rows.
+* unsigned int dest_width : Number of columns to process.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+* Rows 0..2 hold the source rows a, b, c and row 5 holds
+* an, the first row of the next band. Rows 1..4 are
+* rewritten, approximately, as:
+* (2a+3b)/5, (4b+c)/5, (b+4c)/5, (3c+2an)/5
+* Row 0 is left unchanged.
+*
+* SPECIAL NOTES : The routine uses the first line of the band below
+* the current band. The function also has an "C" only
+* version. mm7 doubles as a scratch register in the middle
+* of the loop and is cleared again before the last blend.
+* Columns are processed 8 at a time.
+*
+****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov rsi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea rdi, [rsi+rcx*2] // two lines below
+ add rdi, rcx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_3_5_loop:
+
+ movq mm0, QWORD ptr [rsi] // src[0];
+ movq mm1, QWORD ptr [rsi+rcx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [rsi+rcx], mm0 // write des[1]
+ movq mm0, [rsi+rcx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 is scratch from here until it is cleared below)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [rdi], mm4 // write des[3]
+
+ // mm0, mm2 --- Src[2] (c), still unpacked to words
+
+ pxor mm7, mm7 // clear mm7 for unpacking
+ movq mm1, [rdi+rcx*2] // mm1 = Src[0] of the next group
+
+ movq mm5, three_fifths // mm5 = 3/5
+ pmullw mm0, mm5 // c * 3/5
+
+ movq mm6, two_fifths // mm6 = 2/5
+ movq mm3, mm1 // make a copy
+
+ pmullw mm2, mm5 // c * 3/5
+ punpcklbw mm1, mm7 // unpack low
+
+ pmullw mm1, mm6 // an * 2/5
+ punpckhbw mm3, mm7 // unpack high
+
+ paddw mm0, mm1 // c * 3/5 + an * 2/5
+ pmullw mm3, mm6 // an * 2/5
+
+ paddw mm2, mm3 // c * 3/5 + an * 2/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[4]
+
+ movq QWORD ptr [rdi+rcx], mm0 // write des[4]
+
+ add rdi, 8
+ add rsi, 8
+
+ sub rdx, 8 // eight columns done per pass
+ jg vs_3_5_loop
+ }
+}
+
+/****************************************************************************
+*
+* ROUTINE : last_vertical_band_3_5_scale_mmx
+*
+* INPUTS : unsigned char *dest : First row of the band; scaled in place.
+* unsigned int dest_pitch : Byte distance between consecutive rows.
+* unsigned int dest_width : Number of columns to process.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band of pixels.
+* Rows 0..2 hold the source rows a, b, c. Rows 1..3 are
+* rewritten, approximately, as:
+* (2a+3b)/5, (4b+c)/5, (b+4c)/5
+* and row 4 receives an unmodified copy of c.
+*
+* SPECIAL NOTES : No row below the band is read; des[4] is a copy of
+* Src[2]. The function also has an "C" only version.
+* mm7 is borrowed as a scratch register inside the loop and
+* must be zero again before the next pass unpacks with it.
+* Columns are processed 8 at a time.
+*
+****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov rsi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea rdi, [rsi+rcx*2] // two lines below
+ add rdi, rcx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+
+ last_vs_3_5_loop:
+
+ movq mm0, QWORD ptr [rsi] // src[0];
+ movq mm1, QWORD ptr [rsi+rcx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [rsi+rcx], mm0 // write des[1]
+ movq mm0, [rsi+rcx*2] // mm0 = src[2]
+
+
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq QWORD ptr [rdi+rcx], mm0 // write des[4]; plain copy of Src[2], there is no band below
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 is scratch from here on)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [rdi], mm4 // write des[3]
+
+ pxor mm7, mm7 // mm7 was used as scratch above; zero it again so the next pass unpacks correctly
+
+ add rdi, 8
+ add rsi, 8
+
+ sub rdx, 8 // eight columns done per pass
+ jg last_vs_3_5_loop
+ }
+}
+
+/****************************************************************************
+*
+* ROUTINE : vertical_band_1_2_scale_mmx
+*
+* INPUTS : unsigned char *dest : First row of the band; scaled in place.
+* unsigned int dest_pitch : Byte distance between consecutive rows.
+* unsigned int dest_width : Number of columns to process.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 1 to 2 up-scaling of a band of pixels.
+* Row 1 is written as the rounded average of row 0 and
+* row 2: (a + b + 1) / 2.
+*
+* SPECIAL NOTES : The routine uses the first line of the band below
+* the current band. The function also has an "C" only
+* version. Columns are processed 8 at a time.
+*
+****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+
+ mov rsi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_1_2_loop:
+
+ movq mm0, [rsi] // get Src[0]
+ movq mm1, [rsi + rcx * 2] // get Src[1] (two rows down: first row of the next band)
+
+ movq mm2, mm0 // make copy before unpack
+ movq mm3, mm1 // make copy before unpack
+
+ punpcklbw mm0, mm7 // low Src[0]
+ movq mm6, four_ones // mm6= 1, 1, 1, 1
+
+ punpcklbw mm1, mm7 // low Src[1]
+ paddw mm0, mm1 // low (a + b)
+
+ punpckhbw mm2, mm7 // high Src[0]
+ paddw mm0, mm6 // low (a + b + 1)
+
+ punpckhbw mm3, mm7 // high Src[1]
+ paddw mm2, mm3 // high (a + b )
+
+ psraw mm0, 1 // low (a + b +1 )/2
+ paddw mm2, mm6 // high (a + b + 1)
+
+ psraw mm2, 1 // high (a + b + 1)/2
+ packuswb mm0, mm2 // pack results
+
+ movq [rsi+rcx], mm0 // write out eight bytes
+ add rsi, 8
+
+ sub rdx, 8 // eight columns done per pass
+ jg vs_1_2_loop
+ }
+
+}
+
+/****************************************************************************
+*
+* ROUTINE : last_vertical_band_1_2_scale_mmx
+*
+* INPUTS : unsigned char *dest : First row of the band; scaled in place.
+* unsigned int dest_pitch : Byte distance between consecutive rows.
+* unsigned int dest_width : Number of columns to process.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 1 to 2 up-scaling of the last band of pixels.
+* Row 1 is written as a plain copy of row 0.
+*
+* SPECIAL NOTES : No row below the band is read. The function also has
+* an "C" only version. Columns are copied 8 at a time.
+*
+*
+****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov rsi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ mov edx, dest_width // Loop counter
+
+ last_vs_1_2_loop:
+
+ movq mm0, [rsi] // get Src[0]
+ movq [rsi+rcx], mm0 // write out eight bytes
+
+ add rsi, 8
+ sub rdx, 8 // eight columns done per pass
+
+ jg last_vs_1_2_loop
+ }
+}
+
+/****************************************************************************
+*
+* ROUTINE : horizontal_line_1_2_scale_mmx
+*
+* INPUTS : const unsigned char *source : Row of source pixels (read only).
+* unsigned int source_width : Number of source pixels in the row.
+* unsigned char *dest : Row that receives the scaled pixels.
+* unsigned int dest_width : Not used.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
+* Each source pixel a is emitted followed by the rounded
+* average (a + b + 1) / 2 of a and its right neighbour b.
+* Each pass turns 8 source pixels into 16 output pixels.
+*
+* SPECIAL NOTES : The final 8 pixels are handled after the loop, with the
+* last pixel used as its own right neighbour.
+* NOTE(review): the loop body executes once before the
+* remaining count is first tested, so a source_width of 8
+* or less still processes 16 source pixels - confirm that
+* callers never pass such widths.
+*
+****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+ mov rsi, source
+ mov rdi, dest
+
+ pxor mm7, mm7 // zero, used for byte -> word unpacking
+ movq mm6, four_ones // rounding term for the average
+
+ mov ecx, source_width // remaining source pixels
+
+ hs_1_2_loop:
+
+ movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, [rsi+1] // mm1 = 01 02 03 04 05 06 07 08
+
+ movq mm2, mm0
+ movq mm3, mm1
+
+ movq mm4, mm0 // keep the original pixels for interleaving
+ punpcklbw mm0, mm7
+
+ punpcklbw mm1, mm7
+ paddw mm0, mm1 // low (a + b)
+
+ paddw mm0, mm6 // low (a + b + 1)
+ punpckhbw mm2, mm7
+
+ punpckhbw mm3, mm7
+ paddw mm2, mm3 // high (a + b)
+
+ paddw mm2, mm6 // high (a + b + 1)
+ psraw mm0, 1
+
+ psraw mm2, 1
+ packuswb mm0, mm2 // mm0 = eight averaged pixels
+
+ movq mm2, mm4
+ punpcklbw mm2, mm0 // interleave original / average, low half
+
+ movq [rdi], mm2
+ punpckhbw mm4, mm0 // interleave original / average, high half
+
+ movq [rdi+8], mm4
+ add rsi, 8
+
+ add rdi, 16
+ sub rcx, 8
+
+ cmp rcx, 8 // keep looping while more than eight pixels remain
+ jg hs_1_2_loop
+
+// last eight pixel
+
+ movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0
+
+ movq mm2, mm0
+ movq mm3, mm1
+
+ psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
+ psrlq mm3, 56
+
+ psllq mm3, 56 // mm3 = 00 00 00 00 00 00 00 07
+ por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
+
+ movq mm3, mm1
+ movq mm4, mm0 // keep the original pixels for interleaving
+
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+
+ paddw mm0, mm1 // low (a + b)
+ paddw mm0, mm6 // low (a + b + 1)
+
+ punpckhbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ paddw mm2, mm3 // high (a + b)
+ paddw mm2, mm6 // high (a + b + 1)
+
+ psraw mm0, 1
+ psraw mm2, 1
+
+ packuswb mm0, mm2 // mm0 = eight averaged pixels
+ movq mm2, mm4
+
+ punpcklbw mm2, mm0 // interleave original / average, low half
+ movq [rdi], mm2
+
+ punpckhbw mm4, mm0 // interleave original / average, high half
+ movq [rdi+8], mm4
+ }
+}
+
+
+
+
+
+__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; // per-lane weights applied to the right-hand neighbour
+__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; // per-lane weights applied to the left-hand pixel
+
+
+/****************************************************************************
+*
+* ROUTINE : horizontal_line_5_4_scale_mmx
+*
+* INPUTS : const unsigned char *source : Pointer to source data.
+* unsigned int source_width : Number of source pixels in the row.
+* unsigned char *dest : Pointer to destination data.
+* unsigned int dest_width : Not used.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Copies horizontal line of pixels from source to
+* destination scaling down by 5 to 4. Each pass consumes
+* 5 source pixels and writes 4 destination pixels.
+*
+* SPECIAL NOTES : Every pass loads 8 bytes although only 5 are consumed, so
+* the load for the last group reads 3 bytes beyond it.
+*
+****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ /*
+ Reference C implementation of the assembly below:
+
+ unsigned i;
+ unsigned int a, b, c, d, e;
+ unsigned char *des = dest;
+ const unsigned char *src = source;
+
+ (void) dest_width;
+
+ for ( i=0; i<source_width; i+=5 )
+ {
+ a = src[0];
+ b = src[1];
+ c = src[2];
+ d = src[3];
+ e = src[4];
+
+ des[0] = a;
+ des[1] = ((b*192 + c* 64 + 128)>>8);
+ des[2] = ((c*128 + d*128 + 128)>>8);
+ des[3] = ((d* 64 + e*192 + 128)>>8);
+
+ src += 5;
+ des += 4;
+ }
+ */
+ (void) dest_width;
+
+ __asm
+ {
+
+ mov rsi, source
+ mov rdi, dest
+
+ mov ecx, source_width
+ movq mm5, const54_1 // weights for pixels 00 01 02 03
+
+ pxor mm7, mm7 // zero, used for byte -> word unpacking
+ movq mm6, const54_2 // weights for pixels 01 02 03 04
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ lea rdx, [rsi+rcx] // rdx = end of the source row
+ horizontal_line_5_4_loop:
+
+ movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ psrlq mm0, 8 // mm0 = 01 02 03 04 05 06 07 xx
+ punpcklbw mm1, mm7 // mm1 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm0, mm7 // mm0 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm1, mm5 // 00*256 01*192 02*128 03* 64
+
+ pmullw mm0, mm6 // 01* 0 02* 64 03*128 04*192
+ add rsi, 5
+
+ add rdi, 4
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // added round values
+ psrlw mm1, 8
+
+ cmp rsi, rdx
+ packuswb mm1, mm7
+
+ movd DWORD PTR [rdi-4], mm1 // write the four outputs of this group
+
+ jl horizontal_line_5_4_loop
+
+ }
+
+}
+__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; // 64/256 = 1/4
+__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; // 128/256 = 1/2
+__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; // 192/256 = 3/4
+/* vertical_band_5_4_scale_mmx: 5 to 4 down-scaling of a 5-row band.
+ * Reads source rows s0..s4 (src_pitch apart) and writes destination rows
+ * d0..d3 (dest_pitch apart):
+ * d0 = s0, d1 = (3*s1 + s2)/4, d2 = (s2 + s3)/2, d3 = (s3 + 3*s4)/4
+ * each rounded via round_values. Four columns are handled per pass, so a
+ * dest_width that is not a multiple of 4 is rounded up.
+ * NOTE(review): rbx is modified without an explicit push/pop in this
+ * routine - confirm the compiler preserves it around the __asm block.
+ */
+static
+void vertical_band_5_4_scale_mmx
+(
+ unsigned char *source,
+ unsigned int src_pitch,
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+
+ __asm
+ {
+
+ mov rsi, source // source pointer
+ mov ecx, src_pitch // source pitch
+
+ mov rdi, dest // destination pointer
+ pxor mm7, mm7 // clear out mm7
+
+ mov edx, dest_pitch // destination pitch
+ mov ebx, dest_width // Loop counter (columns remaining)
+
+ vs_5_4_loop:
+
+ movd mm0, DWORD ptr [rsi] // src[0];
+ movd mm1, DWORD ptr [rsi+rcx] // src[1];
+
+ movd mm2, DWORD ptr [rsi+rcx*2] // src[2];
+ lea rax, [rsi+rcx*2] // rax = address of src[2]
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ movq mm3, mm2 // second copy of src[2]
+ pmullw mm1, three_fourths // src[1] * 3/4
+
+ pmullw mm2, one_fourths // src[2] * 1/4
+ movd mm4, [rax+rcx] // src[3];
+
+ pmullw mm3, two_fourths // src[2] * 1/2
+ punpcklbw mm4, mm7
+
+ movq mm5, mm4 // second copy of src[3]
+ pmullw mm4, two_fourths // src[3] * 1/2
+
+ paddw mm1, mm2 // src[1] * 3/4 + src[2] * 1/4
+ movd mm6, [rax+rcx*2] // src[4];
+
+ pmullw mm5, one_fourths // src[3] * 1/4
+ paddw mm1, round_values; // + 128
+
+ paddw mm3, mm4 // src[2] * 1/2 + src[3] * 1/2
+ psrlw mm1, 8
+
+ punpcklbw mm6, mm7
+ paddw mm3, round_values // + 128
+
+ pmullw mm6, three_fourths // src[4] * 3/4
+ psrlw mm3, 8
+
+ packuswb mm1, mm7
+ packuswb mm3, mm7
+
+ movd DWORD PTR [rdi], mm0 // write des[0] (copy of src[0])
+ movd DWORD PTR [rdi+rdx], mm1 // write des[1]
+
+
+ paddw mm5, mm6 // src[3] * 1/4 + src[4] * 3/4
+ movd DWORD PTR [rdi+rdx*2], mm3 // write des[2]
+
+ lea rax, [rdi+rdx*2] // rax = address of des[2]
+ paddw mm5, round_values // + 128
+
+ psrlw mm5, 8
+ add rdi, 4
+
+ packuswb mm5, mm7
+ movd DWORD PTR [rax+rdx], mm5 // write des[3]
+
+ add rsi, 4
+ sub rbx, 4 // four columns done per pass
+
+ jg vs_5_4_loop
+ }
+}
+
+
+__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; // per-lane weights applied to source pixels xx 01 03 05
+__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; // per-lane weights applied to source pixels 00 02 04 06
+
+
+/* horizontal_line_5_3_scale_mmx: 5 to 3 down-scaling of a horizontal line.
+ * For every 5 source pixels a, b, c, d, e three outputs are produced,
+ * approximately:
+ * a, (b + 2c)/3, (2d + e)/3
+ * source_width is the number of source pixels; dest_width is not used.
+ * Inside the loop each group is stored with a 4-byte write whose fourth
+ * byte is zero and is overwritten by the next group; the final group is
+ * stored as exactly 3 bytes. Every group is loaded as 8 bytes, so the
+ * load for the last group reads 3 bytes beyond it.
+ */
+static
+void horizontal_line_5_3_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+
+ mov rsi, source
+ mov rdi, dest
+
+ mov ecx, source_width
+ movq mm5, const53_1 // weights for the odd pixels
+
+ pxor mm7, mm7 // zero, used when packing
+ movq mm6, const53_2 // weights for the even pixels
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ lea rdx, [rsi+rcx-5] // rdx = start of the last 5-pixel group
+ horizontal_line_5_3_loop:
+
+ movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ psllw mm0, 8 // mm0 = xx 00 xx 02 xx 04 xx 06
+ psrlw mm1, 8 // mm1 = 01 xx 03 xx 05 xx 07 xx
+
+ psrlw mm0, 8 // mm0 = 00 xx 02 xx 04 xx 06 xx
+ psllq mm1, 16 // mm1 = xx xx 01 xx 03 xx 05 xx
+
+ pmullw mm0, mm6 // 00*256 02*171 04* 85 06* 0
+
+ pmullw mm1, mm5 // xx* 0 01* 85 03*171 05* 0
+ add rsi, 5
+
+ add rdi, 3
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // added round values
+ psrlw mm1, 8
+
+ cmp rsi, rdx
+ packuswb mm1, mm7
+
+ movd DWORD PTR [rdi-3], mm1 // write 3 outputs plus one zero byte
+ jl horizontal_line_5_3_loop
+
+//exit condition: last group, store exactly three bytes
+ movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ psllw mm0, 8 // mm0 = xx 00 xx 02 xx 04 xx 06
+ psrlw mm1, 8 // mm1 = 01 xx 03 xx 05 xx 07 xx
+
+ psrlw mm0, 8 // mm0 = 00 xx 02 xx 04 xx 06 xx
+ psllq mm1, 16 // mm1 = xx xx 01 xx 03 xx 05 xx
+
+ pmullw mm0, mm6 // 00*256 02*171 04* 85 06* 0
+
+ pmullw mm1, mm5 // xx* 0 01* 85 03*171 05* 0
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // added round values
+ psrlw mm1, 8
+
+ packuswb mm1, mm7
+ movd rax, mm1 // low bytes of rax = the three outputs
+
+ mov rdx, rax
+ shr rdx, 16 // dl = third output
+
+ mov WORD PTR[rdi], ax // write outputs 0 and 1
+ mov BYTE PTR[rdi+2], dl // write output 2
+
+ }
+
+}
+
+/* Q8 (x/256) approximations of 1/3 and 2/3, replicated across four 16-bit lanes. */
+__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
+
+/*
+ * vertical_band_5_3_scale_mmx
+ *
+ * Vertical 5 -> 3 scale of one band: reads five source rows and writes three
+ * destination rows, four pixels per loop step.
+ * dest row 0 = src row 0 (copied unchanged)
+ * dest row 1 = (85*src row 1 + 171*src row 2 + round_values) >> 8
+ * dest row 2 = (171*src row 3 + 85*src row 4 + round_values) >> 8
+ *
+ * source : first of the five source rows
+ * src_pitch : byte distance between consecutive source rows
+ * dest : first of the three destination rows
+ * dest_pitch : byte distance between consecutive destination rows
+ * dest_width : pixels to produce per row. The loop is bottom-tested and steps
+ * by four, so it always runs at least once and touches a
+ * multiple of four bytes in every row.
+ *
+ * round_values is defined elsewhere in this file (presumably 128 per lane for
+ * round-to-nearest -- confirm).
+ */
+static
+void vertical_band_5_3_scale_mmx
+(
+ unsigned char *source,
+ unsigned int src_pitch,
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+
+ __asm
+ {
+
+ mov rsi, source // rsi = source pointer
+ mov ecx, src_pitch // rcx = source pitch
+
+ mov rdi, dest // rdi = destination pointer
+ pxor mm7, mm7 // mm7 = 0, used to unpack bytes to words and to pack back
+
+ mov edx, dest_pitch // rdx = destination pitch
+ movq mm5, one_thirds
+
+ movq mm6, two_thirds
+ mov ebx, dest_width; // rbx = remaining pixel count (loop counter)
+
+ vs_5_3_loop:
+
+ movd mm0, DWORD ptr [rsi] // src[0];
+ movd mm1, DWORD ptr [rsi+rcx] // src[1];
+
+ movd mm2, DWORD ptr [rsi+rcx*2] // src[2]
+ lea rax, [rsi+rcx*2] // rax = address of src[2], base for rows 3 and 4
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ pmullw mm1, mm5 // src[1] * 85
+ pmullw mm2, mm6 // src[2] * 171
+
+ movd mm3, DWORD ptr [rax+rcx] // src[3]
+ movd mm4, DWORD ptr [rax+rcx*2] // src[4]
+
+ punpcklbw mm3, mm7
+ punpcklbw mm4, mm7
+
+ pmullw mm3, mm6 // src[3] * 171
+ pmullw mm4, mm5 // src[4] * 85
+
+
+ movd DWORD PTR [rdi], mm0 // dest row 0 = src[0], unfiltered
+ paddw mm1, mm2
+
+ paddw mm1, round_values
+ psrlw mm1, 8
+
+ packuswb mm1, mm7
+ paddw mm3, mm4
+
+ paddw mm3, round_values
+ movd DWORD PTR [rdi+rdx], mm1 // dest row 1
+
+ psrlw mm3, 8
+ packuswb mm3, mm7
+
+ movd DWORD PTR [rdi+rdx*2], mm3 // dest row 2
+
+
+ add rdi, 4
+ add rsi, 4
+
+ sub rbx, 4 // four pixels done; jg tests the flags set here
+ jg vs_5_3_loop
+ }
+}
+
+
+
+
+/****************************************************************************
+*
+* ROUTINE : horizontal_line_2_1_scale_mmx
+*
+* INPUTS : const unsigned char *source : Source row; 8 bytes are read per loop step.
+* unsigned int source_width : Not referenced.
+* unsigned char *dest : Destination row; 4 bytes are written per loop step.
+* unsigned int dest_width : Number of output pixels; bounds the loop.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels. The
+* even-indexed source bytes are kept and the odd-indexed
+* ones are dropped; no filtering is applied.
+*
+* SPECIAL NOTES : The loop is bottom-tested and produces four output pixels
+* per step, so it always runs at least once and dest_width
+* is effectively rounded up to a multiple of four. Both
+* buffers must tolerate that.
+*
+****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) source_width;
+
+ __asm
+ {
+ mov rsi, source
+ mov rdi, dest
+
+ pxor mm7, mm7 // mm7 = 0 for packing
+ mov ecx, dest_width // rcx = output pixel count
+
+ xor rdx, rdx // rdx = output byte offset
+ hs_2_1_loop:
+
+ movq mm0, [rsi+rdx*2] // 8 source bytes for 4 outputs
+ psllw mm0, 8 // shift the odd bytes out of each word ...
+
+ psrlw mm0, 8 // ... leaving the even bytes as 16-bit values
+ packuswb mm0, mm7 // pack the four words into the low four bytes
+
+ movd DWORD Ptr [rdi+rdx], mm0;
+ add rdx, 4
+
+ cmp rdx, rcx
+ jl hs_2_1_loop
+
+ }
+}
+
+
+
+/*
+ * vertical_band_2_1_scale_mmx
+ *
+ * Copies dest_width bytes from source to dest with vpx_memcpy; no filtering
+ * is applied here (vertical_band_2_1_scale_i_mmx is the filtered variant).
+ * src_pitch and dest_pitch are part of the common scaler signature but are
+ * not needed for a single-row copy.
+ */
+static
+void vertical_band_2_1_scale_mmx
+(
+ unsigned char *source,
+ unsigned int src_pitch,
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width)
+{
+ (void) src_pitch;
+ (void) dest_pitch;
+
+ vpx_memcpy(dest, source, dest_width);
+}
+
+
+/* Q8 (x/256) filter taps: 48/256 = 3/16 and 160/256 = 10/16; 48 + 160 + 48 = 256. */
+__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
+__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
+
+/*
+ * vertical_band_2_1_scale_i_mmx
+ *
+ * Filtered vertical scaler: each output pixel is
+ * (48*above + 160*current + 48*below + round_values) >> 8
+ * where "current" is the row at source and above/below are the rows one
+ * src_pitch before and after it. Four pixels are produced per loop step.
+ *
+ * source : the centre source row; the rows at source - src_pitch and
+ * source + src_pitch are read as well
+ * src_pitch : byte distance between consecutive source rows
+ * dest : destination row
+ * dest_pitch : not referenced
+ * dest_width : pixels to produce. The loop is bottom-tested and steps by
+ * four, so it always runs at least once and touches a multiple
+ * of four bytes.
+ *
+ * round_values is defined elsewhere in this file (presumably 128 per lane --
+ * confirm).
+ */
+static
+void vertical_band_2_1_scale_i_mmx
+(
+ unsigned char *source,
+ unsigned int src_pitch,
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov rsi, source
+ mov rdi, dest
+
+ mov eax, src_pitch
+ mov edx, dest_width
+
+ pxor mm7, mm7
+ sub rsi, rax //back one line
+
+
+ // rcx = end-of-row sentinel for rsi (start of the row above + dest_width)
+ lea rcx, [rsi+rdx];
+ // NOTE(review): mm6 is loaded here but the loop adds round_values from memory instead.
+ movq mm6, round_values;
+
+ movq mm5, three_sixteenths;
+ movq mm4, ten_sixteenths;
+
+ vs_2_1_i_loop:
+ movd mm0, [rsi] // row above
+ movd mm1, [rsi+rax] // current row
+
+ movd mm2, [rsi+rax*2] // row below
+ punpcklbw mm0, mm7
+
+ pmullw mm0, mm5
+ punpcklbw mm1, mm7
+
+ pmullw mm1, mm4
+ punpcklbw mm2, mm7
+
+ pmullw mm2, mm5
+ paddw mm0, round_values
+
+ paddw mm1, mm2
+ paddw mm0, mm1
+
+ psrlw mm0, 8
+ packuswb mm0, mm7
+
+ movd DWORD PTR [rdi], mm0
+ add rsi, 4
+
+ add rdi, 4;
+ cmp rsi, rcx
+ jl vs_2_1_i_loop
+
+ }
+}
+
+
+
+/*
+ * register_mmxscalers
+ *
+ * Points every vp8 scaler function pointer at the corresponding *_mmx routine
+ * defined in this file. The assignments are independent of one another.
+ */
+void
+register_mmxscalers(void)
+{
+ /* Horizontal line scalers. */
+ vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
+ vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
+ vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
+ vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
+ vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
+ vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
+
+ /* Vertical band scalers that also have a "last band" variant. */
+ vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
+ vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
+ vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
+ vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
+ vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
+ vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
+
+ /* Remaining vertical band scalers. */
+ vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
+ vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
+ vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
+ vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
+}
diff --git a/vpx_scale/x86_64/scalesystemdependant.c b/vpx_scale/x86_64/scalesystemdependant.c
new file mode 100644
index 000000000..43f05a68c
--- /dev/null
+++ b/vpx_scale/x86_64/scalesystemdependant.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scalesystemdependant.c
+*
+* Description : Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+#include "cpuidlib.h"
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+extern void register_generic_scalers(void);
+extern void register_mmxscalers(void);
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Calls the scaler registration routine for this platform.
+ * The selection flag is hard-coded, so the MMX set is used.
+ *
+ * SPECIAL NOTES : No run-time CPU feature detection is performed here.
+ *
+ ****************************************************************************/
+void
+vp8_scale_machine_specific_config(void)
+{
+ /* Fixed at 1, so the generic fallback below is never taken. */
+ const int use_mmx = 1;
+
+ if (!use_mmx)
+ {
+ register_generic_scalers();
+ return;
+ }
+
+ register_mmxscalers();
+}
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
new file mode 100644
index 000000000..a8d0ce45b
--- /dev/null
+++ b/vpx_scale/yv12config.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef YV12_CONFIG_H
+#define YV12_CONFIG_H
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define VP7BORDERINPIXELS 48
+#define VP8BORDERINPIXELS 32
+
+ /*************************************
+ Integer colour transform used for INT_YUV:
+
+ Y = (R+G*2+B)/4;
+ U = (R-B)/2;
+ V = (G*2 - R - B)/4;
+ and its inverse:
+ R = Y+U-V;
+ G = Y+V;
+ B = Y-U-V;
+ ************************************/
+ typedef enum
+ {
+ REG_YUV = 0, // Regular YUV
+ INT_YUV = 1 // YUV that can be converted to and from RGB with the integer transform above
+ }
+ YUV_TYPE;
+
+ typedef struct
+ {
+ int y_width; // Y plane width; units presumably pixels (TODO confirm)
+ int y_height; // Y plane height
+ int y_stride; // presumably the byte distance between consecutive Y rows (TODO confirm)
+// int yinternal_width;
+
+ int uv_width; // one set of width/height/stride fields serves both the U and V planes
+ int uv_height;
+ int uv_stride;
+// int uvinternal_width;
+
+ unsigned char *y_buffer; // per-plane data pointers
+ unsigned char *u_buffer;
+ unsigned char *v_buffer;
+
+ unsigned char *buffer_alloc; // presumably the allocation the plane pointers refer into (TODO confirm in yv12config.c)
+ int border; // presumably the border argument given to vp8_yv12_alloc_frame_buffer (TODO confirm)
+ int frame_size; // NOTE(review): units are not visible in this header; confirm in yv12config.c
+ YUV_TYPE clrtype; // REG_YUV or INT_YUV
+ } YV12_BUFFER_CONFIG;
+ // Frame-buffer helpers; NOTE(review): the meaning of the int return values is not visible in this header.
+ int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border);
+ int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+ int vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif //YV12_CONFIG_H
diff --git a/vpx_scale/yv12extend.h b/vpx_scale/yv12extend.h
new file mode 100644
index 000000000..9968feae8
--- /dev/null
+++ b/vpx_scale/yv12extend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef YV12_EXTEND_H
+#define YV12_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+ void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf); // NOTE(review): presumably fills the border area of ybf from its edge pixels -- confirm in yv12extend.c
+
+ /* Copy Y,U,V buffer data from src to dst, filling border of dst as well. */
+
+ void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+ void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); // presumably the Y-plane-only variant of the copy above (TODO confirm)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif