46 files changed, 17885 insertions, 0 deletions
diff --git a/vpx_scale/arm/armv4/gen_scalers_armv4.asm b/vpx_scale/arm/armv4/gen_scalers_armv4.asm
new file mode 100644
index 000000000..1c904edae
--- /dev/null
+++ b/vpx_scale/arm/armv4/gen_scalers_armv4.asm
@@ -0,0 +1,773 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |horizontal_line_4_5_scale_armv4|
+    EXPORT  |vertical_band_4_5_scale_armv4|
+    EXPORT  |horizontal_line_2_3_scale_armv4|
+    EXPORT  |vertical_band_2_3_scale_armv4|
+    EXPORT  |horizontal_line_3_5_scale_armv4|
+    EXPORT  |vertical_band_3_5_scale_armv4|
+    EXPORT  |horizontal_line_3_4_scale_armv4|
+    EXPORT  |vertical_band_3_4_scale_armv4|
+    EXPORT  |horizontal_line_1_2_scale_armv4|
+    EXPORT  |vertical_band_1_2_scale_armv4|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+src         RN  r0                   ; 1st argument: source pointer (the vertical routines pass dest here)
+srcw        RN  r1                   ; 2nd argument of the horizontal routines, used as pixel counter
+dest        RN  r2                   ; 3rd argument of the horizontal routines: output pointer
+mask        RN  r12                  ; byte-select mask (#255 or 0xff00ff, loaded per routine)
+c51_205     RN  r10                  ; packed weight pair, loaded with 0x3300cd (51 : 205)
+c102_154    RN  r11                  ; packed weight pair, loaded with 0x66009a (102 : 154)
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_4_5_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Number of source pixels (loop counter).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 4 to 5.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+;   r0 = UINT8 *source
+;   r1 = UINT32 source_width
+;   r2 = UINT8 *dest
+;   r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ; r0 = src, r1 = srcw (pixel counter), r2 = dest; r3 (dest_width) is reused as scratch.
+    mov     mask, #255              ; 0xff: isolates one pixel of the packed word
+    ldr     c51_205, =0x3300cd      ; 51 in the high half-word, 205 in the low one
+    ldr     c102_154, =0x66009a     ; 102 in the high half-word, 154 in the low one
+
+    ldr     r3, [src], #4           ; four pixels per word; the low byte is treated as src[0]
+
+hl45_loop
+    ; (p | q << 16) * (hi << 16 | lo) puts p * hi + q * lo in bits 16..31 of the product.
+    and     r4, r3, mask            ; a = src[0]
+    and     r5, mask, r3, lsr #8    ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    and     r7, mask, r3, lsr #16   ; c = src[2]
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x800000       ; + 128, aligned with the sum held at bit 16
+    and     r8, mask, r3, lsr #24   ; d = src[3]
+    mov     r6, r6, lsr #24         ; (a * 51 + 205 * b + 128) >> 8
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x800000
+    ldr     r3, [src], #4           ; fetch the next four pixels
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x800000
+    and     r9, mask, r3            ; e = src[4]
+    orr     r9, r9, r8, lsl #16     ; d | e
+    mul     r9, c51_205, r9         ; d * 205 + 51 * e
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    add     r9, r9, #0x800000
+    subs    srcw, srcw, #4          ; nothing below touches the flags before bne
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          ; out[4]
+
+    bne     hl45_loop               ; NOTE(review): leaves only when srcw reaches exactly 0
+    ; Tail group: taken from the word already in r3, no blend with a following pixel.
+    and     r4, r3, mask            ; a
+    and     r5, mask, r3, lsr #8    ; b (lsr, as in the loop; lsl would always give 0)
+    strb    r4, [dest], #1
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    and     r7, mask, r3, lsr #16   ; c
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x800000
+    and     r8, mask, r3, lsr #24   ; d
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1
+
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x800000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1
+
+    add     r7, r7, #0x800000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1
+
+    ldrb    r3, [src]               ; NOTE(review): byte following the tail word, not d - confirm
+    strb    r3, [dest], #1
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_4_5_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
+; *                  height of the band scaled is 4-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ; r0 = dest (through the src alias), r1 = dest_pitch, r2 = dest_width (columns left).
+    ldr     c51_205, =0x3300cd      ; 51 high / 205 low
+    ldr     c102_154, =0x66009a     ; 102 high / 154 low
+
+vl45_loop
+    mov     r3, src                 ; r3 walks down the current column
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
+    add     lr, src, r1             ; lr = store pointer, starting at row 1
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b  (sum lands in bits 16..31)
+
+    ldrb    r8, [r3], r1            ; d = des[dest_pitch*3]
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x800000       ; + 128, aligned with the sum held at bit 16
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mov     r6, r6, lsr #24         ; (sum + 128) >> 8
+    strb    r6, [lr], r1            ; row 1
+
+    ldrb    r9, [r3, r1]            ; e = des [dest_pitch * 5]
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x800000
+    orr     r9, r9, r8, lsl #16     ; d | e
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            ; row 2
+
+    mul     r9, c51_205, r9         ; d * 205 + 51 * e
+    add     r7, r7, #0x800000
+    add     src, src, #1            ; next column
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            ; row 3
+
+    add     r9, r9, #0x800000
+    subs    r2, r2, #1              ; flags survive until bne (no S-suffixed op follows)
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            ; row 4
+
+    bne     vl45_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_2_3_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Number of source pixels (loop counter).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 2 to 3.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+    stmfd   sp!, {r4 - r11, lr}
+    mov     r12, #171               ; weight of b, the pixel shared by both blends
+    mov     lr, #85                 ; weight of a and of c (85 + 171 = 256)
+
+hl23_loop
+    ; Each pass reads a and b, peeks at c (left in place for the next pass), emits 3 pixels.
+    ldrb    r3, [src], #1           ; a
+    ldrb    r4, [src], #1           ; b
+    ldrb    r5, [src]               ; c, src is not advanced past it
+
+    mul     r4, r12, r4             ; 171 * b
+    strb    r3, [dest], #1          ; out[0] = a
+    mla     r7, lr, r5, r4          ; 85 * c + 171 * b
+    mla     r6, lr, r3, r4          ; 85 * a + 171 * b
+
+    add     r7, r7, #128
+    add     r6, r6, #128            ; + 128 so that the >> 8 rounds to nearest
+    mov     r6, r6, lsr #8
+    mov     r7, r7, lsr #8
+
+    strb    r6, [dest], #1          ; out[1]
+    strb    r7, [dest], #1          ; out[2]
+
+    subs    srcw, srcw, #2          ; two source pixels consumed per pass
+    bne     hl23_loop
+
+    ; Tail: r5 still holds the pixel src points at; it becomes a, the pixel after it b.
+    ldrb    r4, [src, #1]           ; b
+    mul     r7, r12, r4             ; 171 * b
+    mla     r6, lr, r5, r7          ; 85 * a + 171 * b
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+
+    strb    r5, [dest], #1          ; out[0] = a
+    strb    r4, [dest, #1]          ; out[2] = b
+    strb    r6, [dest]              ; out[1] = blend of a and b
+
+    ldmfd   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_2_3_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
+; *                  height of the band scaled is 2-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+    stmfd   sp!, {r4 - r8, lr}
+    mov     r12, #171               ; weight of b, the row shared by both blends
+    mov     lr, #85                 ; weight of a and of c (85 + 171 = 256)
+    rsb     r3, r1, r1, lsl #2      ; r3 = 4 * dest_pitch - dest_pitch = 3 * dest_pitch
+
+vl23_loop
+    ; One column per pass: rows 0, 1 and 3 are read, rows 1 and 2 are rewritten.
+    ldrb    r4, [src]               ; a = des [0]
+    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
+    ldrb    r7, [src, r3]           ; c = des [dest_pitch*3]
+
+    mul     r5, r12, r5             ; 171 * b
+    mla     r8, lr, r7, r5          ; 85 * c + 171 * b
+    mla     r6, lr, r4, r5          ; 85 * a + 171 * b
+
+    add     r6, r6, #128            ; + 128 so that the >> 8 rounds to nearest
+    add     r8, r8, #128
+    mov     r6, r6, lsr #8
+    mov     r8, r8, lsr #8
+
+    strb    r6, [src, r1]           ; des [dest_pitch]
+    strb    r8, [src, r1, lsl #1]   ; des [dest_pitch*2]
+
+    add     src, src, #1            ; next column
+    subs    r2, r2, #1              ; one column of dest_width done
+    bne     vl23_loop
+
+    ldmfd   sp!, {r4 - r8, pc}
+    ENDP    ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_3_5_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Number of source pixels (loop counter).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 3 to 5.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ; r0 = src, r1 = srcw (pixel counter, -3 per group), r2 = dest; r3 is not used.
+    ldr     c51_205, =0x3300cd      ; 51 in the high half-word, 205 in the low one
+    ldr     c102_154, =0x66009a     ; 102 in the high half-word, 154 in the low one
+
+    ldrb    r4, [src], #1           ; a = src[0]
+
+hl35_loop
+    ; (p | q << 16) * (hi << 16 | lo) puts p * hi + q * lo in bits 16..31 of the product.
+    ldrb    r8, [src], #1           ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r8, lsl #16     ; b | a
+    ldrb    r9, [src], #1           ; c = src[2]
+    mul     r6, c102_154, r6        ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     ; b | c
+    mul     r5, c51_205, r5         ; b * 205 + 51 * c
+    add     r6, r6, #0x800000       ; + 128, aligned with the sum held at bit 16
+    ldrb    r4, [src], #1           ; d = src[3], which is also the next group's a
+    mov     r6, r6, lsr #24         ; (a * 102 + 154 * b + 128) >> 8
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r9, lsl #16     ; c | b
+    mul     r7, c51_205, r7         ; c * 205 + 51 * b
+    add     r5, r5, #0x800000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    orr     r9, r4, r9, lsl #16     ; c | d
+    mul     r9, c102_154, r9        ; c * 154 + 102 * d
+    add     r7, r7, #0x800000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    add     r9, r9, #0x800000
+    subs    srcw, srcw, #3          ; nothing below touches the flags before bpl
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          ; out[4]
+
+    bpl     hl35_loop               ; NOTE(review): also loops again when srcw lands on exactly 0
+    ; Tail group: a is already in r4; b must go to r8 because every blend below reads r8.
+    ldrb    r8, [src], #1           ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r8, lsl #16     ; b | a
+    ldrb    r9, [src], #1           ; c = src[2]
+    mul     r6, c102_154, r6        ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     ; b | c
+    mul     r5, c51_205, r5         ; b * 205 + 51 * c
+    add     r6, r6, #0x800000
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r9, lsl #16     ; c | b
+    mul     r7, c51_205, r7         ; c * 205 + 51 * b
+    add     r5, r5, #0x800000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x800000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+    strb    r9, [dest], #1          ; out[4] = c, no following pixel to blend with
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_3_5_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
+; *                  height of the band scaled is 3-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ; r0 = dest (through the src alias), r1 = dest_pitch, r2 = dest_width (columns left).
+    ldr     c51_205, =0x3300cd      ; 51 high / 205 low
+    ldr     c102_154, =0x66009a     ; 102 high / 154 low
+
+vl35_loop
+    mov     r3, src                 ; r3 walks down the current column
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
+    add     lr, src, r1             ; lr = store pointer, starting at row 1
+
+    orr     r8, r4, r5, lsl #16     ; b | a
+    mul     r6, c102_154, r8        ; a * 102 + 154 * b  (sum lands in bits 16..31)
+
+    ldrb    r8, [r3, r1, lsl #1]    ; d = des[dest_pitch*5]; r3 is free as scratch from here
+    orr     r3, r7, r5, lsl #16     ; b | c
+    mul     r9, c51_205, r3         ; b * 205 + 51 * c
+    add     r6, r6, #0x800000       ; + 128, aligned with the sum held at bit 16
+    orr     r3, r5, r7, lsl #16     ; c | b
+    mov     r6, r6, lsr #24         ; (sum + 128) >> 8
+    strb    r6, [lr], r1            ; row 1
+
+    mul     r5, c51_205, r3         ; c * 205 + 51 * b
+    add     r9, r9, #0x800000
+    orr     r3, r8, r7, lsl #16     ; c | d
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            ; row 2
+
+    mul     r7, c102_154, r3        ; c * 154 + 102 * d
+    add     r5, r5, #0x800000
+    add     src, src, #1            ; next column
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            ; row 3
+
+    add     r7, r7, #0x800000
+    subs    r2, r2, #1              ; flags survive until bne (no S-suffixed op follows)
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            ; row 4
+
+
+    bne     vl35_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_3_4_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Number of source pixels (loop counter).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 3 to 4.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ; r0 = src, r1 = srcw (pixel counter, -3 per group), r2 = dest; r3 is not used.
+    ldr     r10, =64                ; weight of the farther pixel
+    ldr     r11, =192               ; weight of the nearer pixel (64 + 192 = 256)
+    mov     r9, #128                ; rounding term added before each >> 8
+
+    ldrb    r4, [src], #1           ; a = src[0]
+
+hl34_loop
+    ; Each pass consumes b and c plus the next group's a, and emits four pixels.
+    ldrb    r8, [src], #1           ; b = src[1]
+    ldrb    r7, [src], #1           ; c = src[2]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r8, r4         ; a*64 + b*192 + 128
+
+    add     r8, r8, #1              ; b + 1
+    add     r8, r8, r7              ; b + c + 1
+    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          ; out[1]
+
+    strb    r8, [dest], #1          ; out[2]
+
+    ldrb    r4, [src], #1           ; a' = src[3], kept as the next group's a
+
+    mla     r7, r11, r7, r9         ; c*192 + 128
+    mla     r7, r4, r10, r7         ; c*192 + a'*64 + 128
+
+    subs    srcw, srcw, #3          ; flags stay live until the bpl below
+
+    mov     r7, r7, asr #8          ; (c*192 + a'*64 + 128) >> 8
+    strb    r7, [dest], #1          ; out[3]
+
+    bpl     hl34_loop               ; NOTE(review): also loops again when srcw lands on exactly 0
+
+    ldrb    r8, [src], #1           ; tail group: b = src[1]
+    ldrb    r7, [src], #1           ; c = src[2]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r8, r4         ; a*64 + b*192 + 128
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          ; out[1]
+
+    add     r8, r8, #1              ; b + 1
+    add     r8, r8, r7              ; b + c + 1
+    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
+    strb    r8, [dest], #1          ; out[2]
+    strb    r7, [dest], #1          ; out[3] = c, no following pixel to blend with
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_3_4_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
+; *                  height of the band scaled is 3-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC
+    stmfd   sp!, {r4 - r11, lr}
+
+    ; Blend weights: 64 + 192 = 256, so each weighted sum is normalised by >> 8.
+    mov     r9, #128                ; rounding term
+    mov     r10, #64
+    mov     r11, #192
+    add     r3, r1, r1, lsl #1      ; r3 = 3 * dest_pitch, offset of the last row written
+
+vl34_loop
+    ; One column per pass: rows 0, 1, 2 and 4 are read, rows 1..3 are rewritten
+    ; in place. r0 = column pointer, r1 = dest_pitch, r2 = columns left.
+    ldrb    r4, [src]               ; a = des [0]
+    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
+    ldrb    r7, [src, r1, lsl #1]   ; c = des [dest_pitch*2]
+
+    ; Row 1 = (a*64 + b*192 + 128) >> 8
+    mla     r4, r10, r4, r9
+    mla     r4, r11, r5, r4
+    mov     r4, r4, asr #8
+
+    ; Row 2 = (b + c + 1) >> 1
+    add     r5, r5, r7
+    add     r5, r5, #1
+    mov     r5, r5, asr #1
+
+    strb    r4, [src, r1]           ; des [dest_pitch]
+    ldrb    r4, [src, r1, lsl #2]   ; a' = des [dest_pitch*4], first row of the next band
+    strb    r5, [src, r1, lsl #1]   ; des [dest_pitch*2]
+
+    ; Row 3 = (c*192 + a'*64 + 128) >> 8
+    mla     r7, r11, r7, r9
+    mla     r7, r4, r10, r7
+    mov     r7, r7, asr #8
+    strb    r7, [src, r3]           ; des [dest_pitch*3]
+
+    add     src, src, #1            ; next column
+    subs    r2, r2, #1
+    bne     vl34_loop
+
+    ldmfd   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_1_2_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Number of source pixels (loop counter).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 1 to 2.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC
+    stmfd   sp!, {r4 - r5, lr}
+
+    ldrb    r3, [src], #1           ; r3 = current pixel
+    ldrb    r4, [src], #1           ; r4 = its right-hand neighbour
+    sub     srcw, srcw, #1          ; the last pixel is written after the loop
+
+hl12_loop
+    ; Emit the current pixel followed by its rounded average with the neighbour.
+    add     r5, r4, #1
+    add     r5, r5, r3              ; cur + next + 1
+    mov     r5, r5, lsr #1          ; (cur + next + 1) >> 1
+
+    orr     r5, r3, r5, lsl #8      ; low byte = cur, next byte = average
+    strh    r5, [dest], #2          ; both output pixels in one half-word store
+
+    mov     r3, r4                  ; the neighbour becomes the current pixel
+    subs    srcw, srcw, #1
+    ldrneb  r4, [src], #1           ; load a new neighbour only when another pass follows
+    bne     hl12_loop
+
+    ; r4 still holds the last pixel loaded; it is simply written twice.
+    orr     r5, r4, r4, lsl #8
+    strh    r5, [dest]
+
+    ldmfd   sp!, {r4 - r5, pc}
+    ENDP    ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_1_2_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+; *                  height of the band scaled is 1-pixel.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r7, lr}
+    ; r0 = dest (through the src alias), r1 = dest_pitch, r2 = dest_width (columns left).
+    ldr     mask, =0xff00ff             ; keeps bytes 0 and 2 of a word, one per 16-bit lane
+    ldr     lr, = 0x010001              ; +1 in each 16-bit lane: rounding for the >> 1
+
+vl12_loop
+    mov     r3, src
+    ldr     r4, [r3], r1                ; row 0, four columns per word; r3 moves to row 1
+    ldr     r5, [r3, r1]                ; row 2, same four columns
+
+    add     src, src, #4                ; next four columns
+    subs    r2, r2, #4                  ; flags stay live until the bgt below
+
+    and     r6, r4, mask                ; row 0, bytes 0 and 2
+    and     r7, r5, mask                ; row 2, bytes 0 and 2
+
+    add     r6, r7, r6
+    add     r6, r6, lr                  ; a + b + 1 in each lane (at most 9 bits, no lane overflow)
+
+    and     r4, mask, r4, lsr #8        ; row 0, bytes 1 and 3
+    and     r5, mask, r5, lsr #8        ; row 2, bytes 1 and 3
+
+    mov     r6, r6, lsr #1              ; (a + b + 1) >> 1 per lane
+    and     r6, r6, mask                ; drop the bit that leaked in from the upper lane
+
+    add     r4, r5, r4
+    add     r4, r4, lr
+
+    mov     r4, r4, lsr #1
+    and     r4, r4, mask
+
+    orr     r5, r6, r4, lsl #8          ; re-interleave the even and odd byte averages
+
+    str     r5, [r3]                    ; row 1 = rounded average of rows 0 and 2
+
+    bgt     vl12_loop                   ; loop while columns remain; bpl ran once more at exactly 0
+
+    ldmia   sp!, {r4 - r7, pc}
+    ENDP    ;|vertical_band_1_2_scale_armv4|
+
+    END
diff --git a/vpx_scale/arm/nds/yv12extend.c b/vpx_scale/arm/nds/yv12extend.c
new file mode 100644
index 000000000..56959cb18
--- /dev/null
+++ b/vpx_scale/arm/nds/yv12extend.c
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     yv12extend.c
+*
+*   Description  :
+*
+***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include <nitro.h>
+#include <nitro/mi.h>
+#include <nitro/itcm_begin.h>
+
+//---- DMA Number
+#define DMA_NO  3
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+/****************************************************************************
+*
+****************************************************************************/
+/****************************************************************************
+*
+*  ROUTINE       : extend_plane_borders
+*
+*  INPUTS        : unsigned char *plane : First visible pixel of the plane.
+*                  int stride           : Bytes between vertically adjacent pixels.
+*                  int height           : Number of visible rows.
+*                  int width            : Number of visible pixels per row.
+*                  unsigned int border  : Thickness of the border on every side.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : Fills the border around one plane: every row has its first
+*                  and last pixel repeated sideways, then the first and last
+*                  rows (side borders included) are repeated upwards and
+*                  downwards.
+*
+*  SPECIAL NOTES : The rows are copied with a length of stride bytes starting
+*                  at the left border, so the copy is only as wide as the
+*                  stride. Presumably stride == width + 2 * border; verify
+*                  against the buffer allocation code.
+*
+****************************************************************************/
+static void
+extend_plane_borders(unsigned char *plane, int stride, int height, int width,
+                     unsigned int border)
+{
+    int row;
+    unsigned char *left_pixel, *right_pixel;
+    unsigned char *left_pad, *right_pad;
+    unsigned char *top_row, *bottom_row;
+    unsigned char *above, *below;
+
+    // Side borders: repeat the outermost pixel of each row.
+    left_pixel  = plane;
+    right_pixel = left_pixel + width - 1;
+    left_pad    = left_pixel - border;
+    right_pad   = right_pixel + 1;
+
+    // Sources and destinations all move down one row per pass.
+    for (row = 0; row < height; row++)
+    {
+        mi_cpu_fill8(left_pad, left_pixel[0], border);
+        mi_cpu_fill8(right_pad, right_pixel[0], border);
+        left_pixel  += stride;
+        right_pixel += stride;
+        left_pad    += stride;
+        right_pad   += stride;
+    }
+
+    // Top and bottom borders: repeat the first and last row, starting from
+    // the left border that was just filled in.
+    top_row    = plane - border;
+    bottom_row = top_row + (height * stride) - stride;
+    above      = top_row - (border * stride);
+    below      = bottom_row + stride;
+
+    // Only the destination rows advance; the two source rows stay fixed.
+    for (row = 0; row < (int)border; row++)
+    {
+        mi_cpu_copy_fast(top_row, above, stride);
+        mi_cpu_copy_fast(bottom_row, below, stride);
+        above += stride;
+        below += stride;
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : vp8_yv12_extend_frame_borders
+*
+*  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame whose borders are filled.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : Extends the Y, U and V planes of the frame into their
+*                  borders.
+*
+*  SPECIAL NOTES : The chroma geometry is not read from the structure; it is
+*                  taken as exactly half of the luma stride, height, width
+*                  and border (integer division).
+*                  NOTE(review): odd luma dimensions round down here; confirm
+*                  this matches how the chroma planes were allocated.
+*
+****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int Border;
+    int plane_stride;
+    int plane_height;
+    int plane_width;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    plane_height = ybf->y_height;
+    plane_width = ybf->y_width;
+
+    extend_plane_borders(ybf->y_buffer, plane_stride, plane_height, plane_width, Border);
+
+    // Both chroma planes use the luma geometry halved.
+    plane_stride /= 2;
+    plane_height /= 2;
+    plane_width /= 2;
+    Border /= 2;
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_plane_borders(ybf->u_buffer, plane_stride, plane_height, plane_width, Border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_plane_borders(ybf->v_buffer, plane_stride, plane_height, plane_width, Border);
+}
+
+
+
+/****************************************************************************
+*
+*  ROUTINE       : vp8_yv12_copy_frame
+*
+*  INPUTS        :
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : Copies the source image into the destination image and
+*                  updates the destination's UMV borders.
+*
+*  SPECIAL NOTES : The frames are assumed to be identical in size.
+*
+****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    const int luma_bytes = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride);
+    const int total_bytes = (luma_bytes * 3 / 2) + (src_ybc->y_stride * 2);
+    // total_bytes: one and a half bordered luma planes plus two further luma strides.
+    mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, total_bytes);
+
+    /*  Disabled alternative that copied the three planes one by one:
+        unsigned char *src_y, *dst_y;
+        unsigned char *src_u, *dst_u;
+        unsigned char *src_v, *dst_v;
+
+        int yheight, uv_height;
+        int ystride, uv_stride;
+        int border;
+        int yoffset, uvoffset;
+
+        border   = src_ybc->border;
+        yheight  = src_ybc->y_height;
+        uv_height = src_ybc->uv_height;
+        ystride  = src_ybc->y_stride;
+        uv_stride = src_ybc->uv_stride;
+
+        yoffset  = border * (ystride + 1);
+        uvoffset = border/2 * (uv_stride + 1);
+
+        src_y = src_ybc->y_buffer - yoffset;
+        dst_y = dst_ybc->y_buffer - yoffset;
+        src_u = src_ybc->u_buffer - uvoffset;
+        dst_u = dst_ybc->u_buffer - uvoffset;
+        src_v = src_ybc->v_buffer - uvoffset;
+        dst_v = dst_ybc->v_buffer - uvoffset;
+
+        mi_cpu_copy_fast (src_y, dst_y, ystride *  (yheight + 2 * border));
+        mi_cpu_copy_fast (src_u, dst_u, uv_stride * (uv_height + border));
+        mi_cpu_copy_fast (src_v, dst_v, uv_stride * (uv_height + border));
+    */
+}
+
+#include <nitro/itcm_end.h>
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
new file mode 100644
index 000000000..26384c42c
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
@@ -0,0 +1,227 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_yv12_copy_frame_func_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+|vp8_yv12_copy_frame_func_neon| PROC
+    ; Copies the Y, U and V planes of src_ybc (r0) into dst_ybc (r1), two rows
+    ; per pass. No border extension is performed in this routine.
+    ;
+    ; Register roles:
+    ;   r4 = height, r5 = width, r6 = src stride, r7 = dst stride
+    ;        (Y values first; halved in place for the U/V passes)
+    ;   r2/r3   = src/dst base of the current row pair
+    ;   r8/r9   = src/dst cursor in the first row of the pair
+    ;   r10/r11 = src/dst cursor in the second row of the pair
+    ;   lr = row pairs left, r12 = chunk/byte counter
+    ;   [sp]..[sp,#12] = src u, dst u, src v, dst v plane pointers
+    ;
+    ; Assumptions inherited from the loop structure: the height is even and
+    ; non-zero (rows are handled in pairs with a decrement-then-test loop) and
+    ; the width left over after the 128-byte (Y) / 64-byte (U,V) chunks is a
+    ; multiple of 16 (Y) / 8 (U,V).
+    push            {r4 - r11, lr}
+    vpush           {d8 - d15}
+
+    sub             sp, sp, #16                     ;scratch for the four U/V plane pointers
+
+    ;Copy Y plane
+    ldr             r8, [r0, #yv12_buffer_config_u_buffer]       ;src u_buffer
+    ldr             r9, [r1, #yv12_buffer_config_u_buffer]       ;dst u_buffer
+    ldr             r10, [r0, #yv12_buffer_config_v_buffer]      ;src v_buffer
+    ldr             r11, [r1, #yv12_buffer_config_v_buffer]      ;dst v_buffer
+
+    ldr             r4, [r0, #yv12_buffer_config_y_height]
+    ldr             r5, [r0, #yv12_buffer_config_y_width]
+    ldr             r6, [r0, #yv12_buffer_config_y_stride]
+    ldr             r7, [r1, #yv12_buffer_config_y_stride]
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+
+    ;r0 and r1 are reused as scratch further down, so park the U/V pointers
+    str             r8, [sp]
+    str             r9, [sp, #4]
+    str             r10, [sp, #8]
+    str             r11, [sp, #12]
+
+    ; copy two rows at one time
+    mov             lr, r4, lsr #1
+
+cp_src_to_dst_height_loop
+    mov             r8, r2
+    mov             r9, r3
+    add             r10, r2, r6
+    add             r11, r3, r7
+    movs            r12, r5, lsr #7                 ;number of 128-byte chunks per row
+    beq             extra_cp_needed                 ;y_width < 128: a zero count would wrap in the
+                                                    ;loop below, so let the 16-byte path do it all
+
+cp_src_to_dst_width_loop
+    vld1.8          {q0, q1}, [r8]!
+    vld1.8          {q8, q9}, [r10]!
+    vld1.8          {q2, q3}, [r8]!
+    vld1.8          {q10, q11}, [r10]!
+    vld1.8          {q4, q5}, [r8]!
+    vld1.8          {q12, q13}, [r10]!
+    vld1.8          {q6, q7}, [r8]!
+    vld1.8          {q14, q15}, [r10]!
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0, q1}, [r9]!
+    vst1.8          {q8, q9}, [r11]!
+    vst1.8          {q2, q3}, [r9]!
+    vst1.8          {q10, q11}, [r11]!
+    vst1.8          {q4, q5}, [r9]!
+    vst1.8          {q12, q13}, [r11]!
+    vst1.8          {q6, q7}, [r9]!
+    vst1.8          {q14, q15}, [r11]!
+
+    bne             cp_src_to_dst_width_loop
+
+    subs            lr, lr, #1
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             cp_src_to_dst_height_loop
+
+extra_cp_needed
+    ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
+    sub             r11, r5, r10                    ;bytes per row already copied in 128-byte chunks
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+    bne             extra_cp_src_to_dst_width       ;flags still from the ands above
+end_of_cp_src_to_dst
+
+;Copy U & V planes
+    ldr             r2, [sp]        ;src u_buffer
+    ldr             r3, [sp, #4]        ;dst u_buffer
+    mov             r4, r4, lsr #1                  ;src uv_height
+    mov             r5, r5, lsr #1                  ;src uv_width
+    mov             r6, r6, lsr #1                  ;src uv_stride
+    mov             r7, r7, lsr #1                  ;dst uv_stride
+
+    mov             r1, #2                          ;two chroma planes to copy: U then V
+
+cp_uv_loop
+
+    ;copy two rows at one time
+    mov             lr, r4, lsr #1
+
+cp_src_to_dst_height_uv_loop
+    mov             r8, r2
+    mov             r9, r3
+    add             r10, r2, r6
+    add             r11, r3, r7
+    movs            r12, r5, lsr #6                 ;number of 64-byte chunks per row
+    beq             extra_uv_cp_needed              ;uv_width < 64: skip the chunk loop (count would wrap)
+
+cp_src_to_dst_width_uv_loop
+    vld1.8          {q0, q1}, [r8]!
+    vld1.8          {q8, q9}, [r10]!
+    vld1.8          {q2, q3}, [r8]!
+    vld1.8          {q10, q11}, [r10]!
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0, q1}, [r9]!
+    vst1.8          {q8, q9}, [r11]!
+    vst1.8          {q2, q3}, [r9]!
+    vst1.8          {q10, q11}, [r11]!
+
+    bne             cp_src_to_dst_width_uv_loop
+
+    subs            lr, lr, #1
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             cp_src_to_dst_height_uv_loop
+
+extra_uv_cp_needed
+    ands            r10, r5, #0x3f                  ;check to see if extra copy is needed
+    sub             r11, r5, r10                    ;bytes per row already copied in 64-byte chunks
+    ldr             r2, [sp]        ;src plane base (u, then v)
+    ldr             r3, [sp, #4]        ;dst plane base (u, then v)
+    bne             extra_cp_src_to_dst_uv_width    ;flags still from the ands above
+end_of_cp_src_to_dst_uv
+
+    subs            r1, r1, #1
+
+    addne               sp, sp, #8                  ;step the scratch area from the U pair to the V pair
+
+    ldrne               r2, [sp]        ;src v_buffer
+    ldrne               r3, [sp, #4]        ;dst v_buffer
+
+    bne             cp_uv_loop
+
+    add             sp, sp, #8                      ;release the remaining half of the 16-byte scratch
+
+    vpop            {d8 - d15}
+    pop             {r4 - r11, pc}
+
+;=============================
+;Y remainder: copy the last r10 bytes of every row, 16 bytes at a time.
+;On entry r2/r3 = plane base, r11 = byte offset of the remainder, r10 = its size.
+extra_cp_src_to_dst_width
+    add             r2, r2, r11
+    add             r3, r3, r11
+
+    mov             lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop
+    mov             r8, r2
+    mov             r9, r3
+    add             r0, r8, r6                      ;second row src (r0 is free: src_ybc no longer needed)
+    add             r11, r9, r7                     ;second row dst
+
+    mov             r12, r10
+
+extra_cp_src_to_dst_width_loop
+    vld1.8          {q0}, [r8]!
+    vld1.8          {q1}, [r0]!
+
+    subs            r12, r12, #16
+
+    vst1.8          {q0}, [r9]!
+    vst1.8          {q1}, [r11]!
+    bne             extra_cp_src_to_dst_width_loop
+
+    subs            lr, lr, #1
+
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             extra_cp_src_to_dst_height_loop
+
+    b               end_of_cp_src_to_dst
+
+;=================================
+;U/V remainder: copy the last r10 bytes of every row, 8 bytes at a time.
+;On entry r2/r3 = plane base, r11 = byte offset of the remainder, r10 = its size.
+extra_cp_src_to_dst_uv_width
+    add             r2, r2, r11
+    add             r3, r3, r11
+
+    mov             lr, r4, lsr #1
+extra_cp_src_to_dst_height_uv_loop
+    mov             r8, r2
+    mov             r9, r3
+    add             r0, r8, r6                      ;second row src
+    add             r11, r9, r7                     ;second row dst
+
+    mov             r12, r10
+
+extra_cp_src_to_dst_width_uv_loop
+    vld1.8          {d0}, [r8]!
+    vld1.8          {d1}, [r0]!
+
+    subs            r12, r12, #8
+
+    vst1.8          {d0}, [r9]!
+    vst1.8          {d1}, [r11]!
+    bne             extra_cp_src_to_dst_width_uv_loop
+
+    subs            lr, lr, #1
+
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             extra_cp_src_to_dst_height_uv_loop
+
+    b               end_of_cp_src_to_dst_uv
+
+    ENDP
+    END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
new file mode 100644
index 000000000..a50ae60d7
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
@@ -0,0 +1,499 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_yv12_copy_frame_yonly_neon|
+    EXPORT  |vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+; Note: this is a VP8 function; the border is either 32 or 16. Internal y_width and
+; y_height are always multiples of 16.
+
+|vp8_yv12_copy_frame_yonly_neon| PROC
+    ; Copies the Y plane of src_ybc (r0) into dst_ybc (r1), two rows per pass,
+    ; then extends the destination Y plane into its left/right and top/bottom
+    ; borders. The border width is read from dst_ybc: 16 selects the b16 path,
+    ; any other value takes the path written for 32.
+    ;
+    ; Copy phase registers:
+    ;   r4 = y_height, r5 = y_width, r6 = src y_stride, r7 = dst y_stride
+    ;   r2/r3 = src/dst row-pair base, r8/r9 and r10/r11 = row cursors
+    ;   lr = row pairs left, r12 = chunk/byte counter
+    ; Border phase registers:
+    ;   r0 = dst_ybc, r3 = border, r4 = y_height, lr = y_stride,
+    ;   r8 = border * y_stride, r1/r2 = source cursors, r5/r6 = dest cursors
+    ;
+    ; Assumptions inherited from the loop structure: y_height is a non-zero
+    ; multiple of 4, and y_width / y_stride are multiples of 16.
+    push            {r4 - r11, lr}
+    vpush           {d8 - d15}
+
+    ldr             r4, [r0, #yv12_buffer_config_y_height]
+    ldr             r5, [r0, #yv12_buffer_config_y_width]
+    ldr             r6, [r0, #yv12_buffer_config_y_stride]
+    ldr             r7, [r1, #yv12_buffer_config_y_stride]
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+
+    ; copy two rows at one time
+    mov             lr, r4, lsr #1
+
+cp_src_to_dst_height_loop
+    mov             r8, r2
+    mov             r9, r3
+    add             r10, r2, r6
+    add             r11, r3, r7
+    movs            r12, r5, lsr #7                 ;number of 128-byte chunks per row
+    beq             extra_cp_needed                 ;y_width < 128: a zero count would wrap in the
+                                                    ;loop below, so let the 16-byte path do it all
+
+cp_src_to_dst_width_loop
+    vld1.8          {q0, q1}, [r8]!
+    vld1.8          {q8, q9}, [r10]!
+    vld1.8          {q2, q3}, [r8]!
+    vld1.8          {q10, q11}, [r10]!
+    vld1.8          {q4, q5}, [r8]!
+    vld1.8          {q12, q13}, [r10]!
+    vld1.8          {q6, q7}, [r8]!
+    vld1.8          {q14, q15}, [r10]!
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0, q1}, [r9]!
+    vst1.8          {q8, q9}, [r11]!
+    vst1.8          {q2, q3}, [r9]!
+    vst1.8          {q10, q11}, [r11]!
+    vst1.8          {q4, q5}, [r9]!
+    vst1.8          {q12, q13}, [r11]!
+    vst1.8          {q6, q7}, [r9]!
+    vst1.8          {q14, q15}, [r11]!
+
+    bne             cp_src_to_dst_width_loop
+
+    subs            lr, lr, #1
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             cp_src_to_dst_height_loop
+
+extra_cp_needed
+    ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
+    sub             r11, r5, r10                    ;bytes per row already copied in 128-byte chunks
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+    bne             extra_cp_src_to_dst_width       ;flags still from the ands above
+end_of_cp_src_to_dst
+
+
+    ;vpxyv12_extend_frame_borders_yonly
+    mov             r0, r1                          ;from here on only the destination buffer is used
+    ;Not need to load y_width, since: y_width = y_stride - 2*border
+    ldr             r3, [r0, #yv12_buffer_config_border]
+    ldr             r1, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r4, [r0, #yv12_buffer_config_y_height]
+    ldr             lr, [r0, #yv12_buffer_config_y_stride]
+
+    cmp             r3, #16
+    beq             b16_extend_frame_borders
+
+;=======================
+b32_extend_frame_borders
+;border = 32
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+    sub             r5, r1, r3              ;destptr1
+    add             r6, r1, lr
+    sub             r6, r6, r3, lsl #1      ;destptr2
+    sub             r2, r6, #1              ;srcptr2
+
+    ;Do four rows at one time
+    mov             r12, r4, lsr #2
+
+copy_left_right_y
+    ;each load replicates one edge pixel across a whole q register
+    vld1.8          {d0[], d1[]}, [r1], lr
+    vld1.8          {d4[], d5[]}, [r2], lr
+    vld1.8          {d8[], d9[]}, [r1], lr
+    vld1.8          {d12[], d13[]}, [r2], lr
+    vld1.8          {d16[], d17[]},  [r1], lr
+    vld1.8          {d20[], d21[]}, [r2], lr
+    vld1.8          {d24[], d25[]}, [r1], lr
+    vld1.8          {d28[], d29[]}, [r2], lr
+
+    ;duplicate so that each store below writes 32 bytes
+    vmov            q1, q0
+    vmov            q3, q2
+    vmov            q5, q4
+    vmov            q7, q6
+    vmov            q9, q8
+    vmov            q11, q10
+    vmov            q13, q12
+    vmov            q15, q14
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0, q1}, [r5], lr
+    vst1.8          {q2, q3}, [r6], lr
+    vst1.8          {q4, q5}, [r5], lr
+    vst1.8          {q6, q7}, [r6], lr
+    vst1.8          {q8, q9}, [r5], lr
+    vst1.8          {q10, q11}, [r6], lr
+    vst1.8          {q12, q13}, [r5], lr
+    vst1.8          {q14, q15}, [r6], lr
+
+    bne             copy_left_right_y
+
+;Now copy the top and bottom source lines into each line of the respective borders
+    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    mul             r8, r3, lr
+
+    sub             r6, r1, r3              ;destptr2
+    sub             r2, r6, lr              ;srcptr2
+    sub             r1, r7, r3              ;srcptr1
+    sub             r5, r1, r8              ;destptr1
+
+    movs            r12, lr, lsr #7                 ;number of 128-byte column chunks in a row
+    beq             extra_top_bottom_check_y        ;y_stride < 128: skip the chunk loop (count would wrap)
+
+copy_top_bottom_y
+    vld1.8          {q0, q1}, [r1]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q2, q3}, [r1]!
+    vld1.8          {q10, q11}, [r2]!
+    vld1.8          {q4, q5}, [r1]!
+    vld1.8          {q12, q13}, [r2]!
+    vld1.8          {q6, q7}, [r1]!
+    vld1.8          {q14, q15}, [r2]!
+
+    mov             r7, r3                          ;one pass per border row
+
+top_bottom_32
+    subs            r7, r7, #1
+
+    vst1.8          {q0, q1}, [r5]!
+    vst1.8          {q8, q9}, [r6]!
+    vst1.8          {q2, q3}, [r5]!
+    vst1.8          {q10, q11}, [r6]!
+    vst1.8          {q4, q5}, [r5]!
+    vst1.8          {q12, q13}, [r6]!
+    vst1.8          {q6, q7}, [r5]!
+    vst1.8          {q14, q15}, [r6]!
+
+    ;move to the same 128-byte column chunk of the next border row
+    add             r5, r5, lr
+    sub             r5, r5, #128
+    add             r6, r6, lr
+    sub             r6, r6, #128
+
+    bne             top_bottom_32
+
+    ;back to the first border row, at the next column chunk
+    sub             r5, r1, r8
+    add             r6, r2, lr
+
+    subs            r12, r12, #1
+    bne             copy_top_bottom_y
+
+extra_top_bottom_check_y
+    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
+    ands            r7, r7, #0x7
+    bne             extra_top_bottom_y
+end_of_border_copy_y
+
+    vpop            {d8 - d15}
+    pop             {r4 - r11, pc}
+
+;=====================
+;extra copy part for Y
+;r7 = number of 16-byte column chunks left over after the 128-byte chunks
+extra_top_bottom_y
+    vld1.8          {q0}, [r1]!
+    vld1.8          {q2}, [r2]!
+
+    mov             r9, r3, lsr #3                  ;the loop below writes 8 border rows per pass
+
+extra_top_bottom_32
+    subs            r9, r9, #1
+
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    bne             extra_top_bottom_32
+
+    sub             r5, r1, r8
+    add             r6, r2, lr
+    subs            r7, r7, #1
+    bne             extra_top_bottom_y
+
+    b               end_of_border_copy_y
+
+
+;=======================
+b16_extend_frame_borders
+;border = 16
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+    sub             r5, r1, r3              ;destptr1
+    add             r6, r1, lr
+    sub             r6, r6, r3, lsl #1      ;destptr2
+    sub             r2, r6, #1              ;srcptr2
+
+    ;Do four rows at one time
+    mov             r12, r4, lsr #2
+
+copy_left_right_y_b16
+    ;each load replicates one edge pixel across a whole q register
+    vld1.8          {d0[], d1[]}, [r1], lr
+    vld1.8          {d4[], d5[]}, [r2], lr
+    vld1.8          {d8[], d9[]}, [r1], lr
+    vld1.8          {d12[], d13[]}, [r2], lr
+    vld1.8          {d16[], d17[]},  [r1], lr
+    vld1.8          {d20[], d21[]}, [r2], lr
+    vld1.8          {d24[], d25[]}, [r1], lr
+    vld1.8          {d28[], d29[]}, [r2], lr
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q4}, [r5], lr
+    vst1.8          {q6}, [r6], lr
+    vst1.8          {q8}, [r5], lr
+    vst1.8          {q10}, [r6], lr
+    vst1.8          {q12}, [r5], lr
+    vst1.8          {q14}, [r6], lr
+
+    bne             copy_left_right_y_b16
+
+;Now copy the top and bottom source lines into each line of the respective borders
+    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    mul             r8, r3, lr
+
+    sub             r6, r1, r3              ;destptr2
+    sub             r2, r6, lr              ;srcptr2
+    sub             r1, r7, r3              ;srcptr1
+    sub             r5, r1, r8              ;destptr1
+
+    movs            r12, lr, lsr #7                 ;number of 128-byte column chunks in a row
+    beq             extra_top_bottom_check_y_b16    ;y_stride < 128: skip the chunk loop (count would wrap)
+
+copy_top_bottom_y_b16
+    vld1.8          {q0, q1}, [r1]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q2, q3}, [r1]!
+    vld1.8          {q10, q11}, [r2]!
+    vld1.8          {q4, q5}, [r1]!
+    vld1.8          {q12, q13}, [r2]!
+    vld1.8          {q6, q7}, [r1]!
+    vld1.8          {q14, q15}, [r2]!
+
+    mov             r7, r3                          ;one pass per border row
+
+top_bottom_16_b16
+    subs            r7, r7, #1
+
+    vst1.8          {q0, q1}, [r5]!
+    vst1.8          {q8, q9}, [r6]!
+    vst1.8          {q2, q3}, [r5]!
+    vst1.8          {q10, q11}, [r6]!
+    vst1.8          {q4, q5}, [r5]!
+    vst1.8          {q12, q13}, [r6]!
+    vst1.8          {q6, q7}, [r5]!
+    vst1.8          {q14, q15}, [r6]!
+
+    ;move to the same 128-byte column chunk of the next border row
+    add             r5, r5, lr
+    sub             r5, r5, #128
+    add             r6, r6, lr
+    sub             r6, r6, #128
+
+    bne             top_bottom_16_b16
+
+    ;back to the first border row, at the next column chunk
+    sub             r5, r1, r8
+    add             r6, r2, lr
+
+    subs            r12, r12, #1
+    bne             copy_top_bottom_y_b16
+
+extra_top_bottom_check_y_b16
+    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
+    ands            r7, r7, #0x7
+    bne             extra_top_bottom_y_b16
+end_of_border_copy_y_b16
+
+    vpop            {d8 - d15}
+    pop             {r4 - r11, pc}
+
+;=====================
+;extra copy part for Y
+;r7 = number of 16-byte column chunks left over after the 128-byte chunks
+extra_top_bottom_y_b16
+    vld1.8          {q0}, [r1]!
+    vld1.8          {q2}, [r2]!
+
+    mov             r9, r3, lsr #3                  ;the loop below writes 8 border rows per pass
+
+extra_top_bottom_16_b16
+    subs            r9, r9, #1
+
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    bne             extra_top_bottom_16_b16
+
+    sub             r5, r1, r8
+    add             r6, r2, lr
+    subs            r7, r7, #1
+    bne             extra_top_bottom_y_b16
+
+    b               end_of_border_copy_y_b16
+
+;=============================
+;Y remainder: copy the last r10 bytes of every row, 16 bytes at a time.
+;On entry r2/r3 = plane base, r11 = byte offset of the remainder, r10 = its size.
+extra_cp_src_to_dst_width
+    add             r2, r2, r11
+    add             r3, r3, r11
+
+    mov             lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop
+    mov             r8, r2
+    mov             r9, r3
+    add             r0, r8, r6                      ;second row src (r0 is free: src_ybc no longer needed)
+    add             r11, r9, r7                     ;second row dst
+
+    mov             r12, r10
+
+extra_cp_src_to_dst_width_loop
+    vld1.8          {q0}, [r8]!
+    vld1.8          {q1}, [r0]!
+
+    subs            r12, r12, #16
+
+    vst1.8          {q0}, [r9]!
+    vst1.8          {q1}, [r11]!
+    bne             extra_cp_src_to_dst_width_loop
+
+    subs            lr, lr, #1
+
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             extra_cp_src_to_dst_height_loop
+
+    b               end_of_cp_src_to_dst
+
+    ENDP
+
+;===========================================================
+;In vp8cx_pick_filter_level(), call vp8_yv12_copy_frame_yonly
+;without extend_frame_borders.
+|vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon| PROC
+    ; Copies the Y plane of src_ybc (r0) into dst_ybc (r1), two rows per pass.
+    ; No border extension is performed.
+    ;
+    ; Register roles:
+    ;   r4 = y_height, r5 = y_width, r6 = src y_stride, r7 = dst y_stride
+    ;   r2/r3 = src/dst row-pair base, r8/r9 and r10/r11 = row cursors
+    ;   lr = row pairs left, r12 = chunk/byte counter
+    ;
+    ; Assumptions inherited from the loop structure: y_height is even and
+    ; non-zero, and the width left over after the 128-byte chunks is a
+    ; multiple of 16.
+    push            {r4 - r11, lr}
+    vpush           {d8-d15}
+
+    ldr             r4, [r0, #yv12_buffer_config_y_height]
+    ldr             r5, [r0, #yv12_buffer_config_y_width]
+    ldr             r6, [r0, #yv12_buffer_config_y_stride]
+    ldr             r7, [r1, #yv12_buffer_config_y_stride]
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+
+    ; copy two rows at one time
+    mov             lr, r4, lsr #1
+
+cp_src_to_dst_height_loop1
+    mov             r8, r2
+    mov             r9, r3
+    add             r10, r2, r6
+    add             r11, r3, r7
+    movs            r12, r5, lsr #7                 ;number of 128-byte chunks per row
+    beq             extra_cp_needed1                ;y_width < 128: a zero count would wrap in the
+                                                    ;loop below, so let the 16-byte path do it all
+
+cp_src_to_dst_width_loop1
+    vld1.8          {q0, q1}, [r8]!
+    vld1.8          {q8, q9}, [r10]!
+    vld1.8          {q2, q3}, [r8]!
+    vld1.8          {q10, q11}, [r10]!
+    vld1.8          {q4, q5}, [r8]!
+    vld1.8          {q12, q13}, [r10]!
+    vld1.8          {q6, q7}, [r8]!
+    vld1.8          {q14, q15}, [r10]!
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0, q1}, [r9]!
+    vst1.8          {q8, q9}, [r11]!
+    vst1.8          {q2, q3}, [r9]!
+    vst1.8          {q10, q11}, [r11]!
+    vst1.8          {q4, q5}, [r9]!
+    vst1.8          {q12, q13}, [r11]!
+    vst1.8          {q6, q7}, [r9]!
+    vst1.8          {q14, q15}, [r11]!
+
+    bne             cp_src_to_dst_width_loop1
+
+    subs            lr, lr, #1
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             cp_src_to_dst_height_loop1
+
+extra_cp_needed1
+    ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
+    sub             r11, r5, r10                    ;bytes per row already copied in 128-byte chunks
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+    bne             extra_cp_src_to_dst_width1      ;flags still from the ands above
+end_of_cp_src_to_dst1
+
+    vpop            {d8 - d15}
+    pop             {r4-r11, pc}
+
+;=============================
+;Y remainder: copy the last r10 bytes of every row, 16 bytes at a time.
+;On entry r2/r3 = plane base, r11 = byte offset of the remainder, r10 = its size.
+extra_cp_src_to_dst_width1
+    add             r2, r2, r11
+    add             r3, r3, r11
+
+    mov             lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop1
+    mov             r8, r2
+    mov             r9, r3
+    add             r0, r8, r6                      ;second row src (r0 is free: src_ybc no longer needed)
+    add             r11, r9, r7                     ;second row dst
+
+    mov             r12, r10
+
+extra_cp_src_to_dst_width_loop1
+    vld1.8          {q0}, [r8]!
+    vld1.8          {q1}, [r0]!
+
+    subs            r12, r12, #16
+
+    vst1.8          {q0}, [r9]!
+    vst1.8          {q1}, [r11]!
+    bne             extra_cp_src_to_dst_width_loop1
+
+    subs            lr, lr, #1
+
+    add             r2, r2, r6, lsl #1
+    add             r3, r3, r7, lsl #1
+
+    bne             extra_cp_src_to_dst_height_loop1
+
+    b               end_of_cp_src_to_dst1
+
+    ENDP
+
+    END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
new file mode 100644
index 000000000..c8923d5a5
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
@@ -0,0 +1,257 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_yv12_copy_src_frame_func_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: This function is used to copy source data in src_buffer[i] at beginning of
+;the encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,
+;which can be ANY numbers(NOT always multiples of 16 or 4).
+
+;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+|vp8_yv12_copy_src_frame_func_neon| PROC
+    ; Copies the Y, U and V planes of src_ybc (r0) into dst_ybc (r1) for
+    ; arbitrary widths and heights. Rows are copied in pairs; each row is
+    ; copied in 128-byte (Y) or 64-byte (U/V) chunks, then 8-byte chunks, then
+    ; single bytes. A trailing odd row is copied on its own.
+    ;
+    ; Every chunk loop tests its counter after the copy, so each one is entered
+    ; only when at least one full chunk remains; likewise the row-pair loop is
+    ; entered only when there are at least two rows.
+    ;
+    ; Register roles:
+    ;   r4 = height, r5 = width of the current plane
+    ;   r6/r7 = src/dst (2 * stride - width): step from the end of one row to
+    ;           the start of the row two below
+    ;   r2/r3 = src/dst cursor in the first row of the pair
+    ;   r10/r11 = src/dst cursor in the second row of the pair
+    ;   lr = row pairs left, r12 = bytes left in the row, r8 = byte scratch
+    ;   r9 = chroma planes left to copy
+    push            {r4 - r11, lr}
+    vpush           {d8 - d15}
+
+    ;Copy Y plane
+    ldr             r4, [r0, #yv12_buffer_config_y_height]
+    ldr             r5, [r0, #yv12_buffer_config_y_width]
+    ldr             r6, [r0, #yv12_buffer_config_y_stride]
+    ldr             r7, [r1, #yv12_buffer_config_y_stride]
+    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
+
+    add             r10, r2, r6             ;second row src
+    add             r11, r3, r7             ;second row dst
+    mov             r6, r6, lsl #1
+    mov             r7, r7, lsl #1
+    sub             r6, r6, r5              ;adjust stride
+    sub             r7, r7, r5
+
+    ; copy two rows at one time
+    movs            lr, r4, lsr #1
+    beq             cp_last_row_y           ;fewer than two rows: no row pair to copy
+
+cp_src_to_dst_height_loop
+    mov             r12, r5
+    cmp             r12, #128
+    blo             cp_width_128_done       ;row narrower than one 128-byte chunk
+
+cp_width_128_loop
+    vld1.8          {q0, q1}, [r2]!
+    vld1.8          {q4, q5}, [r10]!
+    vld1.8          {q2, q3}, [r2]!
+    vld1.8          {q6, q7}, [r10]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q12, q13}, [r10]!
+    vld1.8          {q10, q11}, [r2]!
+    vld1.8          {q14, q15}, [r10]!
+    sub             r12, r12, #128
+    cmp             r12, #128
+    vst1.8          {q0, q1}, [r3]!
+    vst1.8          {q4, q5}, [r11]!
+    vst1.8          {q2, q3}, [r3]!
+    vst1.8          {q6, q7}, [r11]!
+    vst1.8          {q8, q9}, [r3]!
+    vst1.8          {q12, q13}, [r11]!
+    vst1.8          {q10, q11}, [r3]!
+    vst1.8          {q14, q15}, [r11]!
+    bhs             cp_width_128_loop       ;flags from the cmp; the NEON stores leave them alone
+
+cp_width_128_done
+    cmp             r12, #0
+    beq             cp_width_done
+    cmp             r12, #8
+    blo             cp_width_1_loop         ;1..7 bytes left: byte loop only
+
+cp_width_8_loop
+    vld1.8          {d0}, [r2]!
+    vld1.8          {d1}, [r10]!
+    sub             r12, r12, #8
+    cmp             r12, #8
+    vst1.8          {d0}, [r3]!
+    vst1.8          {d1}, [r11]!
+    bhs             cp_width_8_loop
+
+    cmp             r12, #0
+    beq             cp_width_done
+
+cp_width_1_loop
+    ldrb            r8, [r2], #1
+    subs            r12, r12, #1
+    strb            r8, [r3], #1
+    ldrb            r8, [r10], #1
+    strb            r8, [r11], #1
+    bne             cp_width_1_loop
+
+cp_width_done
+    subs            lr, lr, #1
+    add             r2, r2, r6
+    add             r3, r3, r7
+    add             r10, r10, r6
+    add             r11, r11, r7
+    bne             cp_src_to_dst_height_loop
+
+;copy last line for Y if y_height is odd
+cp_last_row_y
+    tst             r4, #1
+    beq             cp_width_done_1
+    mov             r12, r5
+    cmp             r12, #128
+    blo             cp_width_128_done_1     ;row narrower than one 128-byte chunk
+
+cp_width_128_loop_1
+    vld1.8          {q0, q1}, [r2]!
+    vld1.8          {q2, q3}, [r2]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q10, q11}, [r2]!
+    sub             r12, r12, #128
+    cmp             r12, #128
+    vst1.8          {q0, q1}, [r3]!
+    vst1.8          {q2, q3}, [r3]!
+    vst1.8          {q8, q9}, [r3]!
+    vst1.8          {q10, q11}, [r3]!
+    bhs             cp_width_128_loop_1
+
+cp_width_128_done_1
+    cmp             r12, #0
+    beq             cp_width_done_1
+    cmp             r12, #8
+    blo             cp_width_1_loop_1       ;1..7 bytes left: byte loop only
+
+cp_width_8_loop_1
+    vld1.8          {d0}, [r2]!
+    sub             r12, r12, #8
+    cmp             r12, #8
+    vst1.8          {d0}, [r3]!
+    bhs             cp_width_8_loop_1
+
+    cmp             r12, #0
+    beq             cp_width_done_1
+
+cp_width_1_loop_1
+    ldrb            r8, [r2], #1
+    subs            r12, r12, #1
+    strb            r8, [r3], #1
+    bne             cp_width_1_loop_1
+cp_width_done_1
+
+;Copy U & V planes
+    ldr             r4, [r0, #yv12_buffer_config_uv_height]
+    ldr             r5, [r0, #yv12_buffer_config_uv_width]
+    ldr             r6, [r0, #yv12_buffer_config_uv_stride]
+    ldr             r7, [r1, #yv12_buffer_config_uv_stride]
+    ldr             r2, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
+    ldr             r3, [r1, #yv12_buffer_config_u_buffer]       ;dstptr1
+
+    add             r10, r2, r6             ;second row src
+    add             r11, r3, r7             ;second row dst
+    mov             r6, r6, lsl #1
+    mov             r7, r7, lsl #1
+    sub             r6, r6, r5              ;adjust stride
+    sub             r7, r7, r5
+
+    mov             r9, #2                  ;two chroma planes to copy: U then V
+
+cp_uv_loop
+    ;copy two rows at one time
+    movs            lr, r4, lsr #1
+    beq             cp_last_row_uv          ;fewer than two rows: no row pair to copy
+
+cp_src_to_dst_height_uv_loop
+    mov             r12, r5
+    cmp             r12, #64
+    blo             cp_width_uv_64_done     ;row narrower than one 64-byte chunk
+
+cp_width_uv_64_loop
+    vld1.8          {q0, q1}, [r2]!
+    vld1.8          {q4, q5}, [r10]!
+    vld1.8          {q2, q3}, [r2]!
+    vld1.8          {q6, q7}, [r10]!
+    sub             r12, r12, #64
+    cmp             r12, #64
+    vst1.8          {q0, q1}, [r3]!
+    vst1.8          {q4, q5}, [r11]!
+    vst1.8          {q2, q3}, [r3]!
+    vst1.8          {q6, q7}, [r11]!
+    bhs             cp_width_uv_64_loop
+
+cp_width_uv_64_done
+    cmp             r12, #0
+    beq             cp_width_uv_done
+    cmp             r12, #8
+    blo             cp_width_uv_1_loop      ;1..7 bytes left: byte loop only
+
+cp_width_uv_8_loop
+    vld1.8          {d0}, [r2]!
+    vld1.8          {d1}, [r10]!
+    sub             r12, r12, #8
+    cmp             r12, #8
+    vst1.8          {d0}, [r3]!
+    vst1.8          {d1}, [r11]!
+    bhs             cp_width_uv_8_loop
+
+    cmp             r12, #0
+    beq             cp_width_uv_done
+
+cp_width_uv_1_loop
+    ldrb            r8, [r2], #1
+    subs            r12, r12, #1
+    strb            r8, [r3], #1
+    ldrb            r8, [r10], #1
+    strb            r8, [r11], #1
+    bne             cp_width_uv_1_loop
+
+cp_width_uv_done
+    subs            lr, lr, #1
+    add             r2, r2, r6
+    add             r3, r3, r7
+    add             r10, r10, r6
+    add             r11, r11, r7
+    bne             cp_src_to_dst_height_uv_loop
+
+;copy last line for U & V if uv_height is odd
+cp_last_row_uv
+    tst             r4, #1
+    beq             cp_width_uv_done_1
+    mov             r12, r5
+    cmp             r12, #64
+    blo             cp_width_uv_64_done_1   ;row narrower than one 64-byte chunk
+
+cp_width_uv_64_loop_1
+    vld1.8          {q0, q1}, [r2]!
+    vld1.8          {q2, q3}, [r2]!
+    sub             r12, r12, #64
+    cmp             r12, #64
+    vst1.8          {q0, q1}, [r3]!
+    vst1.8          {q2, q3}, [r3]!
+    bhs             cp_width_uv_64_loop_1
+
+cp_width_uv_64_done_1
+    cmp             r12, #0
+    beq             cp_width_uv_done_1
+    cmp             r12, #8
+    blo             cp_width_uv_1_loop_1    ;1..7 bytes left: byte loop only
+
+cp_width_uv_8_loop_1
+    vld1.8          {d0}, [r2]!
+    sub             r12, r12, #8
+    cmp             r12, #8
+    vst1.8          {d0}, [r3]!
+    bhs             cp_width_uv_8_loop_1
+
+    cmp             r12, #0
+    beq             cp_width_uv_done_1
+
+cp_width_uv_1_loop_1
+    ldrb            r8, [r2], #1
+    subs            r12, r12, #1
+    strb            r8, [r3], #1
+    bne             cp_width_uv_1_loop_1
+cp_width_uv_done_1
+
+    ;after U, set up the V plane pointers and go round once more
+    subs            r9, r9, #1
+    ldrne           r2, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
+    ldrne           r3, [r1, #yv12_buffer_config_v_buffer]      ;dstptr1
+    ldrne           r10, [r0, #yv12_buffer_config_uv_stride]
+    ldrne           r11, [r1, #yv12_buffer_config_uv_stride]
+
+    addne           r10, r2, r10                ;second row src
+    addne           r11, r3, r11                ;second row dst
+
+    bne             cp_uv_loop
+
+    vpop            {d8 - d15}
+    pop             {r4 - r11, pc}
+
+    ENDP
+    END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
new file mode 100644
index 000000000..8c9ce1962
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
@@ -0,0 +1,587 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_yv12_extend_frame_borders_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);   r0 = ybf
+;Note: VP8-only routine for border = 32 or 16 (anything but 16 takes the 32 path). y_width and
+; y_height are multiples of 16; rows shorter than one copy block use the extra_* column loops.
+
+|vp8_yv12_extend_frame_borders_neon| PROC
+    push            {r4 - r10, lr}
+    vpush           {d8 - d15}              ;d8-d15 are overwritten below (callee-saved, AAPCS)
+
+    ;y_width is not loaded: it equals y_stride - 2*border
+    ldr             r3, [r0, #yv12_buffer_config_border]
+    ldr             r1, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    ldr             r4, [r0, #yv12_buffer_config_y_height]
+    ldr             lr, [r0, #yv12_buffer_config_y_stride]
+
+    cmp             r3, #16
+    beq             b16_extend_frame_borders
+
+;=======================
+b32_extend_frame_borders
+;border = 32
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+    sub             r5, r1, r3              ;destptr1: start of the left border of row 0
+    add             r6, r1, lr
+    sub             r6, r6, r3, lsl #1      ;destptr2: src + stride - 2*border = start of the right border
+    sub             r2, r6, #1              ;srcptr2: last pixel of row 0
+
+    ;Do four rows at one time
+    mov             r12, r4, lsr #2
+
+copy_left_right_y
+    vld1.8          {d0[], d1[]}, [r1], lr      ;broadcast the leftmost pixel of a row into q0
+    vld1.8          {d4[], d5[]}, [r2], lr      ;broadcast the rightmost pixel of the same row into q2
+    vld1.8          {d8[], d9[]}, [r1], lr
+    vld1.8          {d12[], d13[]}, [r2], lr
+    vld1.8          {d16[], d17[]},  [r1], lr
+    vld1.8          {d20[], d21[]}, [r2], lr
+    vld1.8          {d24[], d25[]}, [r1], lr
+    vld1.8          {d28[], d29[]}, [r2], lr
+
+    vmov            q1, q0                  ;duplicate so that each store below writes 32 bytes
+    vmov            q3, q2
+    vmov            q5, q4
+    vmov            q7, q6
+    vmov            q9, q8
+    vmov            q11, q10
+    vmov            q13, q12
+    vmov            q15, q14
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0, q1}, [r5], lr      ;left border of the row
+    vst1.8          {q2, q3}, [r6], lr      ;right border of the row
+    vst1.8          {q4, q5}, [r5], lr
+    vst1.8          {q6, q7}, [r6], lr
+    vst1.8          {q8, q9}, [r5], lr
+    vst1.8          {q10, q11}, [r6], lr
+    vst1.8          {q12, q13}, [r5], lr
+    vst1.8          {q14, q15}, [r6], lr
+
+    bne             copy_left_right_y
+
+;Now copy the top and bottom source lines into each line of the respective borders
+    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    mul             r8, r3, lr              ;r8 = border * stride = bytes in the top border
+
+    movs            r12, lr, lsr #7         ;128-byte blocks per row; Z set when stride < 128
+
+    sub             r6, r1, r3              ;destptr2: first row below the image (r1 is one row past the last)
+    sub             r2, r6, lr              ;srcptr2: last image row, left border included
+    sub             r1, r7, r3              ;srcptr1: first image row, left border included
+    sub             r5, r1, r8              ;destptr1: first row of the top border
+    beq             check_extra_y           ;no full block: skip the do-while loop (flags still from movs)
+copy_top_bottom_y
+    vld1.8          {q0, q1}, [r1]!         ;128 bytes of the top row ...
+    vld1.8          {q8, q9}, [r2]!         ;... and 128 bytes of the bottom row
+    vld1.8          {q2, q3}, [r1]!
+    vld1.8          {q10, q11}, [r2]!
+    vld1.8          {q4, q5}, [r1]!
+    vld1.8          {q12, q13}, [r2]!
+    vld1.8          {q6, q7}, [r1]!
+    vld1.8          {q14, q15}, [r2]!
+
+    mov             r7, r3                  ;one pass per border row
+
+top_bottom_32
+    subs            r7, r7, #1
+
+    vst1.8          {q0, q1}, [r5]!
+    vst1.8          {q8, q9}, [r6]!
+    vst1.8          {q2, q3}, [r5]!
+    vst1.8          {q10, q11}, [r6]!
+    vst1.8          {q4, q5}, [r5]!
+    vst1.8          {q12, q13}, [r6]!
+    vst1.8          {q6, q7}, [r5]!
+    vst1.8          {q14, q15}, [r6]!
+
+    add             r5, r5, lr              ;next border row, same 128-byte column block
+    sub             r5, r5, #128
+    add             r6, r6, lr
+    sub             r6, r6, #128
+
+    bne             top_bottom_32
+
+    sub             r5, r1, r8              ;top of the next column block (r1/r2 already advanced)
+    add             r6, r2, lr
+
+    subs            r12, r12, #1
+    bne             copy_top_bottom_y
+check_extra_y
+    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
+    ands            r7, r7, #0x7                ;r7 = leftover 16-byte columns
+    bne             extra_top_bottom_y
+end_of_border_copy_y
+
+;Border copy for U, V planes
+    ldr             r1, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
+    mov             lr, lr, lsr #1              ;uv_stride
+    mov             r3, r3, lsr #1              ;border
+    mov             r4, r4, lsr #1              ;uv_height
+    mov             r8, r8, lsr #2              ;(border/2) * uv_stride
+
+    mov             r10, #2                     ;two passes: U first, then V
+
+;copy the left and right most columns out
+border_copy_uv
+    sub             r5, r1, r3              ;destptr1
+    add             r6, r1, lr
+    sub             r6, r6, r3, lsl #1      ;destptr2
+    sub             r2, r6, #1              ;srcptr2
+
+    mov             r7, r1                  ;keep the plane start for the top/bottom copy
+
+    ;Do eight rows at one time
+    mov             r12, r4, lsr #3
+
+copy_left_right_uv
+    vld1.8          {d0[], d1[]}, [r1], lr      ;even q registers: leftmost pixel, 16 copies
+    vld1.8          {d2[], d3[]}, [r2], lr      ;odd q registers: rightmost pixel, 16 copies
+    vld1.8          {d4[], d5[]}, [r1], lr
+    vld1.8          {d6[], d7[]}, [r2], lr
+    vld1.8          {d8[], d9[]},  [r1], lr
+    vld1.8          {d10[], d11[]}, [r2], lr
+    vld1.8          {d12[], d13[]}, [r1], lr
+    vld1.8          {d14[], d15[]}, [r2], lr
+    vld1.8          {d16[], d17[]}, [r1], lr
+    vld1.8          {d18[], d19[]}, [r2], lr
+    vld1.8          {d20[], d21[]}, [r1], lr
+    vld1.8          {d22[], d23[]}, [r2], lr
+    vld1.8          {d24[], d25[]},  [r1], lr
+    vld1.8          {d26[], d27[]}, [r2], lr
+    vld1.8          {d28[], d29[]}, [r1], lr
+    vld1.8          {d30[], d31[]}, [r2], lr
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q1}, [r6], lr
+    vst1.8          {q2}, [r5], lr
+    vst1.8          {q3}, [r6], lr
+    vst1.8          {q4}, [r5], lr
+    vst1.8          {q5}, [r6], lr
+    vst1.8          {q6}, [r5], lr
+    vst1.8          {q7}, [r6], lr
+    vst1.8          {q8}, [r5], lr
+    vst1.8          {q9}, [r6], lr
+    vst1.8          {q10}, [r5], lr
+    vst1.8          {q11}, [r6], lr
+    vst1.8          {q12}, [r5], lr
+    vst1.8          {q13}, [r6], lr
+    vst1.8          {q14}, [r5], lr
+    vst1.8          {q15}, [r6], lr
+
+    bne             copy_left_right_uv
+
+;Now copy the top and bottom source lines into each line of the respective borders
+    movs            r12, lr, lsr #6         ;64-byte blocks per row; Z set when uv_stride < 64
+
+    sub             r6, r1, r3              ;destptr2
+    sub             r2, r6, lr              ;srcptr2
+    sub             r1, r7, r3              ;srcptr1
+    sub             r5, r1, r8              ;destptr1
+    beq             check_extra_uv          ;no full block: skip the do-while loop (flags still from movs)
+copy_top_bottom_uv
+    vld1.8          {q0, q1}, [r1]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q2, q3}, [r1]!
+    vld1.8          {q10, q11}, [r2]!
+
+    mov             r7, r3                  ;one pass per border row
+
+top_bottom_16
+    subs            r7, r7, #1
+
+    vst1.8          {q0, q1}, [r5]!
+    vst1.8          {q8, q9}, [r6]!
+    vst1.8          {q2, q3}, [r5]!
+    vst1.8          {q10, q11}, [r6]!
+
+    add             r5, r5, lr              ;next border row, same 64-byte column block
+    sub             r5, r5, #64
+    add             r6, r6, lr
+    sub             r6, r6, #64
+
+    bne             top_bottom_16
+
+    sub             r5, r1, r8
+    add             r6, r2, lr
+
+    subs            r12, r12, #1
+    bne             copy_top_bottom_uv
+check_extra_uv
+    mov             r7, lr, lsr #3              ;check to see if extra copy is needed
+    ands            r7, r7, #0x7                ;r7 = leftover 8-byte columns
+    bne             extra_top_bottom_uv
+
+end_of_border_copy_uv
+    subs            r10, r10, #1
+    ldrne           r1, [r0, #yv12_buffer_config_v_buffer]       ;srcptr1
+    bne             border_copy_uv
+
+    vpop            {d8 - d15}
+    pop             {r4 - r10, pc}
+
+;;;;;;;;;;;;;;;;;;;;;;
+;extra copy part for Y
+extra_top_bottom_y
+    vld1.8          {q0}, [r1]!             ;16 bytes of the top row
+    vld1.8          {q2}, [r2]!             ;16 bytes of the bottom row
+
+    mov             r9, r3, lsr #3          ;border / 8 passes of eight rows each
+
+extra_top_bottom_32
+    subs            r9, r9, #1
+
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    bne             extra_top_bottom_32
+
+    sub             r5, r1, r8              ;top of the next 16-byte column
+    add             r6, r2, lr
+    subs            r7, r7, #1
+    bne             extra_top_bottom_y
+
+    b               end_of_border_copy_y
+
+;extra copy part for UV
+extra_top_bottom_uv
+    vld1.8          {d0}, [r1]!             ;8 bytes of the top row
+    vld1.8          {d8}, [r2]!             ;8 bytes of the bottom row
+
+    mov             r9, r3, lsr #3          ;(border/2) / 8 passes of eight rows each
+
+extra_top_bottom_16
+    subs            r9, r9, #1
+
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    bne             extra_top_bottom_16
+
+    sub             r5, r1, r8              ;top of the next 8-byte column
+    add             r6, r2, lr
+    subs            r7, r7, #1
+    bne             extra_top_bottom_uv
+
+    b               end_of_border_copy_uv
+
+
+;=======================
+b16_extend_frame_borders
+;border = 16
+;=======================
+;Border copy for Y plane
+;copy the left and right most columns out
+    sub             r5, r1, r3              ;destptr1: start of the left border of row 0
+    add             r6, r1, lr
+    sub             r6, r6, r3, lsl #1      ;destptr2: src + stride - 2*border = start of the right border
+    sub             r2, r6, #1              ;srcptr2: last pixel of row 0
+
+    ;Do four rows at one time
+    mov             r12, r4, lsr #2
+
+copy_left_right_y_b16
+    vld1.8          {d0[], d1[]}, [r1], lr      ;broadcast the leftmost pixel of a row into q0
+    vld1.8          {d4[], d5[]}, [r2], lr      ;broadcast the rightmost pixel of the same row into q2
+    vld1.8          {d8[], d9[]}, [r1], lr
+    vld1.8          {d12[], d13[]}, [r2], lr
+    vld1.8          {d16[], d17[]},  [r1], lr
+    vld1.8          {d20[], d21[]}, [r2], lr
+    vld1.8          {d24[], d25[]}, [r1], lr
+    vld1.8          {d28[], d29[]}, [r2], lr
+
+    subs            r12, r12, #1
+
+    vst1.8          {q0}, [r5], lr          ;16-byte left border
+    vst1.8          {q2}, [r6], lr          ;16-byte right border
+    vst1.8          {q4}, [r5], lr
+    vst1.8          {q6}, [r6], lr
+    vst1.8          {q8}, [r5], lr
+    vst1.8          {q10}, [r6], lr
+    vst1.8          {q12}, [r5], lr
+    vst1.8          {q14}, [r6], lr
+
+    bne             copy_left_right_y_b16
+
+;Now copy the top and bottom source lines into each line of the respective borders
+    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
+    mul             r8, r3, lr              ;r8 = border * stride = bytes in the top border
+
+    movs            r12, lr, lsr #7         ;128-byte blocks per row; Z set when stride < 128
+
+    sub             r6, r1, r3              ;destptr2
+    sub             r2, r6, lr              ;srcptr2
+    sub             r1, r7, r3              ;srcptr1
+    sub             r5, r1, r8              ;destptr1
+    beq             check_extra_y_b16       ;no full block: skip the do-while loop (flags still from movs)
+copy_top_bottom_y_b16
+    vld1.8          {q0, q1}, [r1]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q2, q3}, [r1]!
+    vld1.8          {q10, q11}, [r2]!
+    vld1.8          {q4, q5}, [r1]!
+    vld1.8          {q12, q13}, [r2]!
+    vld1.8          {q6, q7}, [r1]!
+    vld1.8          {q14, q15}, [r2]!
+
+    mov             r7, r3                  ;one pass per border row
+
+top_bottom_16_b16
+    subs            r7, r7, #1
+
+    vst1.8          {q0, q1}, [r5]!
+    vst1.8          {q8, q9}, [r6]!
+    vst1.8          {q2, q3}, [r5]!
+    vst1.8          {q10, q11}, [r6]!
+    vst1.8          {q4, q5}, [r5]!
+    vst1.8          {q12, q13}, [r6]!
+    vst1.8          {q6, q7}, [r5]!
+    vst1.8          {q14, q15}, [r6]!
+
+    add             r5, r5, lr              ;next border row, same 128-byte column block
+    sub             r5, r5, #128
+    add             r6, r6, lr
+    sub             r6, r6, #128
+
+    bne             top_bottom_16_b16
+
+    sub             r5, r1, r8
+    add             r6, r2, lr
+
+    subs            r12, r12, #1
+    bne             copy_top_bottom_y_b16
+check_extra_y_b16
+    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
+    ands            r7, r7, #0x7                ;r7 = leftover 16-byte columns
+    bne             extra_top_bottom_y_b16
+end_of_border_copy_y_b16
+
+;Border copy for U, V planes
+    ldr             r1, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
+    mov             lr, lr, lsr #1              ;uv_stride
+    mov             r3, r3, lsr #1              ;border
+    mov             r4, r4, lsr #1              ;uv_height
+    mov             r8, r8, lsr #2              ;(border/2) * uv_stride
+
+    mov             r10, #2                     ;two passes: U first, then V
+
+;copy the left and right most columns out
+border_copy_uv_b16
+    sub             r5, r1, r3              ;destptr1
+    add             r6, r1, lr
+    sub             r6, r6, r3, lsl #1      ;destptr2
+    sub             r2, r6, #1              ;srcptr2
+
+    mov             r7, r1                  ;keep the plane start for the top/bottom copy
+
+    ;Do eight rows at one time
+    mov             r12, r4, lsr #3
+
+copy_left_right_uv_b16
+    vld1.8          {d0[]}, [r1], lr            ;leftmost pixel, 8 copies
+    vld1.8          {d2[]}, [r2], lr            ;rightmost pixel, 8 copies
+    vld1.8          {d4[]}, [r1], lr
+    vld1.8          {d6[]}, [r2], lr
+    vld1.8          {d8[]},  [r1], lr
+    vld1.8          {d10[]}, [r2], lr
+    vld1.8          {d12[]}, [r1], lr
+    vld1.8          {d14[]}, [r2], lr
+    vld1.8          {d16[]}, [r1], lr
+    vld1.8          {d18[]}, [r2], lr
+    vld1.8          {d20[]}, [r1], lr
+    vld1.8          {d22[]}, [r2], lr
+    vld1.8          {d24[]},  [r1], lr
+    vld1.8          {d26[]}, [r2], lr
+    vld1.8          {d28[]}, [r1], lr
+    vld1.8          {d30[]}, [r2], lr
+
+    subs            r12, r12, #1
+
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d2}, [r6], lr
+    vst1.8          {d4}, [r5], lr
+    vst1.8          {d6}, [r6], lr
+    vst1.8          {d8}, [r5], lr
+    vst1.8          {d10}, [r6], lr
+    vst1.8          {d12}, [r5], lr
+    vst1.8          {d14}, [r6], lr
+    vst1.8          {d16}, [r5], lr
+    vst1.8          {d18}, [r6], lr
+    vst1.8          {d20}, [r5], lr
+    vst1.8          {d22}, [r6], lr
+    vst1.8          {d24}, [r5], lr
+    vst1.8          {d26}, [r6], lr
+    vst1.8          {d28}, [r5], lr
+    vst1.8          {d30}, [r6], lr
+
+    bne             copy_left_right_uv_b16
+
+;Now copy the top and bottom source lines into each line of the respective borders
+    movs            r12, lr, lsr #6         ;64-byte blocks per row; Z set when uv_stride < 64
+
+    sub             r6, r1, r3              ;destptr2
+    sub             r2, r6, lr              ;srcptr2
+    sub             r1, r7, r3              ;srcptr1
+    sub             r5, r1, r8              ;destptr1
+    beq             check_extra_uv_b16      ;no full block: skip the do-while loop (flags still from movs)
+copy_top_bottom_uv_b16
+    vld1.8          {q0, q1}, [r1]!
+    vld1.8          {q8, q9}, [r2]!
+    vld1.8          {q2, q3}, [r1]!
+    vld1.8          {q10, q11}, [r2]!
+
+    mov             r7, r3                  ;one pass per border row
+
+top_bottom_8_b16
+    subs            r7, r7, #1
+
+    vst1.8          {q0, q1}, [r5]!
+    vst1.8          {q8, q9}, [r6]!
+    vst1.8          {q2, q3}, [r5]!
+    vst1.8          {q10, q11}, [r6]!
+
+    add             r5, r5, lr              ;next border row, same 64-byte column block
+    sub             r5, r5, #64
+    add             r6, r6, lr
+    sub             r6, r6, #64
+
+    bne             top_bottom_8_b16
+
+    sub             r5, r1, r8
+    add             r6, r2, lr
+
+    subs            r12, r12, #1
+    bne             copy_top_bottom_uv_b16
+check_extra_uv_b16
+    mov             r7, lr, lsr #3              ;check to see if extra copy is needed
+    ands            r7, r7, #0x7                ;r7 = leftover 8-byte columns
+    bne             extra_top_bottom_uv_b16
+
+end_of_border_copy_uv_b16
+    subs            r10, r10, #1
+    ldrne           r1, [r0, #yv12_buffer_config_v_buffer]       ;srcptr1
+    bne             border_copy_uv_b16
+
+    vpop            {d8-d15}
+    pop             {r4 - r10, pc}
+
+;;;;;;;;;;;;;;;;;;;;;;
+;extra copy part for Y
+extra_top_bottom_y_b16
+    vld1.8          {q0}, [r1]!             ;16 bytes of the top row
+    vld1.8          {q2}, [r2]!             ;16 bytes of the bottom row
+
+    mov             r9, r3, lsr #3          ;border / 8 passes of eight rows each
+
+extra_top_bottom_16_b16
+    subs            r9, r9, #1
+
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    vst1.8          {q0}, [r5], lr
+    vst1.8          {q2}, [r6], lr
+    bne             extra_top_bottom_16_b16
+
+    sub             r5, r1, r8              ;top of the next 16-byte column
+    add             r6, r2, lr
+    subs            r7, r7, #1
+    bne             extra_top_bottom_y_b16
+
+    b               end_of_border_copy_y_b16
+
+;extra copy part for UV
+extra_top_bottom_uv_b16
+    vld1.8          {d0}, [r1]!             ;8 bytes of the top row
+    vld1.8          {d8}, [r2]!             ;8 bytes of the bottom row
+
+    mov             r9, r3, lsr #3          ;(border/2) / 8 passes of eight rows each
+
+extra_top_bottom_8_b16
+    subs            r9, r9, #1
+
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    vst1.8          {d0}, [r5], lr
+    vst1.8          {d8}, [r6], lr
+    bne             extra_top_bottom_8_b16
+
+    sub             r5, r1, r8              ;top of the next 8-byte column
+    add             r6, r2, lr
+    subs            r7, r7, #1
+    bne             extra_top_bottom_uv_b16
+
+    b               end_of_border_copy_uv_b16
+
+    ENDP
+    END
diff --git a/vpx_scale/arm/scalesystemdependant.c b/vpx_scale/arm/scalesystemdependant.c
new file mode 100644
index 000000000..3c355becc
--- /dev/null
+++ b/vpx_scale/arm/scalesystemdependant.c
@@ -0,0 +1,87 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+#ifdef HAVE_CONFIG_H
+#include "vpx_config.h"
+#endif
+
+void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf); /* dispatch pointer, assigned in vp8_scale_machine_specific_config() */
+extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf);
+
+void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); /* dispatch pointer, assigned in vp8_scale_machine_specific_config() */
+extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); /* dispatch pointer, assigned in vp8_scale_machine_specific_config() */
+extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Selects the NEON (HAVE_ARMV7) or C frame-copy and border-extension
+ *                  routines and stores them in the dispatch function pointers.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config()
+{
+    /* Disabled armv4 scaler hookup, kept for reference:
+    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_armv4;
+    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_armv4;
+    vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
+    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_armv4;
+    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_armv4;
+    vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
+    vp8_horizontal_line_3_4_scale        = horizontal_line_3_4_scale_armv4;
+    vp8_vertical_band_3_4_scale          = vertical_band_3_4_scale_armv4;
+    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_horizontal_line_2_3_scale        = horizontal_line_2_3_scale_armv4;
+    vp8_vertical_band_2_3_scale          = vertical_band_2_3_scale_armv4;
+    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
+    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_armv4;
+    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_armv4;
+    vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
+
+    vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
+    vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
+    vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
+    vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
+    vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
+    vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
+    vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
+    */
+#if !HAVE_ARMV7
+    /* HAVE_ARMV7 is off: dispatch to the C routines */
+    vp8_yv12_copy_frame_ptr                = vp8_yv12_copy_frame;
+    vp8_yv12_copy_frame_yonly_ptr          = vp8_yv12_copy_frame_yonly;
+    vp8_yv12_extend_frame_borders_ptr      = vp8_yv12_extend_frame_borders;
+#else
+    /* HAVE_ARMV7 is on: dispatch to the NEON routines */
+    vp8_yv12_copy_frame_ptr                = vp8_yv12_copy_frame_neon;
+    vp8_yv12_copy_frame_yonly_ptr          = vp8_yv12_copy_frame_yonly_neon;
+    vp8_yv12_extend_frame_borders_ptr      = vp8_yv12_extend_frame_borders_neon;
+#endif
+}
diff --git a/vpx_scale/arm/yv12extend_arm.c b/vpx_scale/arm/yv12extend_arm.c
new file mode 100644
index 000000000..7c3f7cd07
--- /dev/null
+++ b/vpx_scale/arm/yv12extend_arm.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+
+void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
+                              YV12_BUFFER_CONFIG *dst_ybc)
+{
+    /* hand the copy itself to the assembly routine */
+    vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
+    /* then extend the destination's borders through the dispatch pointer */
+    vp8_yv12_extend_frame_borders_ptr(dst_ybc);
+}
diff --git a/vpx_scale/blackfin/yv12config.c b/vpx_scale/blackfin/yv12config.c
new file mode 100644
index 000000000..7cb083fb9
--- /dev/null
+++ b/vpx_scale/blackfin/yv12config.c
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     yv12config.c
+ *
+ *   Description  :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include <cdef_bf533.h>
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+void
+extend_memset(void *dst, unsigned char value, unsigned int size);
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+int
+vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    // Frees the buffer held in ybf->buffer_alloc, if there is one.
+    // Returns 0 on success, -1 when ybf is NULL.
+    int status = -1;
+
+    if (ybf)
+    {
+        if (ybf->buffer_alloc)
+            duck_free(ybf->buffer_alloc);
+
+        // clear the pointer so that a repeated call does not free it again
+        ybf->buffer_alloc = 0;
+        status = 0;
+    }
+
+    return status;
+}
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+int
+vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border)
+{
+    // Fills in ybf for a width x height frame and allocates its pixel buffer.
+    // Returns 0 on success, -1 if the allocation fails, -2 if ybf is NULL.
+
+    // Byte sizes of the bordered planes: luma has `border` pixels on every side,
+    // each chroma plane is half size with border/2 pixels on every side.
+    int luma_bytes = (height + 2 * border) * (width + 2 * border);
+    int chroma_bytes = (height / 2 + border) * (width / 2 + border);
+
+    if (!ybf)
+        return -2;
+
+    // release whatever buffer the descriptor currently owns
+    vp8_yv12_de_alloc_frame_buffer(ybf);
+
+    ybf->border = border;
+    ybf->y_width  = width;
+    ybf->y_height = height;
+    ybf->y_stride = width + 2 * border;
+
+    ybf->uv_width = width / 2;
+    ybf->uv_height = height / 2;
+    ybf->uv_stride = ybf->uv_width + border;
+
+    // A single block holds Y, U and V back to back (3/2 of the luma size) plus padding so
+    // that copy12x12 doesn't fail on a large vertical motion vector in the last V block.
+    // NOTE(review): the original comment promised 2 extra lines; the code adds one y_stride.
+    ybf->buffer_alloc = (unsigned char *) duck_memalign(32, (luma_bytes * 3 / 2) +  ybf->y_stride , 0);
+
+    if (ybf->buffer_alloc == NULL)
+        return -1;
+
+    // the plane pointers skip the top and left border of their plane
+    ybf->y_buffer = ybf->buffer_alloc + border * ybf->y_stride + border;
+    ybf->u_buffer = ybf->buffer_alloc + luma_bytes + border / 2  * ybf->uv_stride + border / 2;
+    ybf->v_buffer = ybf->buffer_alloc + luma_bytes + chroma_bytes + border / 2  * ybf->uv_stride + border / 2;
+
+    return 0;
+}
+/****************************************************************************
+ *  Fills each plane of ybf, borders included, with black (Y = 0x00, U/V = 0x80).
+ *  Returns 0 on success (also when no buffer is allocated), -1 when ybf is NULL.
+ ****************************************************************************/
+int
+vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    int uv_border;
+    if (!ybf)
+        return -1;
+    if (!ybf->buffer_alloc)
+        return 0;
+    // y/u/v_buffer point past the top and left border, while the fill size covers the whole
+    // bordered plane. Step back to the plane origin first; filling from *_buffer ran past the plane.
+    uv_border = ybf->border / 2;
+    extend_memset(ybf->y_buffer - ybf->border * ybf->y_stride - ybf->border, 0x0, ybf->y_stride *(ybf->y_height + 2 * ybf->border));
+    extend_memset(ybf->u_buffer - uv_border * ybf->uv_stride - uv_border, 0x80, ybf->uv_stride *(ybf->uv_height + ybf->border));
+    extend_memset(ybf->v_buffer - uv_border * ybf->uv_stride - uv_border, 0x80, ybf->uv_stride *(ybf->uv_height + ybf->border));
+    return 0;
+}
diff --git a/vpx_scale/blackfin/yv12extend.c b/vpx_scale/blackfin/yv12extend.c
new file mode 100644
index 000000000..d5be4950d
--- /dev/null
+++ b/vpx_scale/blackfin/yv12extend.c
@@ -0,0 +1,349 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     yv12extend.c
+ *
+ *   Description  :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include <cdef_bf533.h>
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*
+****************************************************************************/
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : extend_memset
+ *
+ *  INPUTS        : void *dst           : Start of the region to fill.
+ *                  unsigned char value : Byte value to store.
+ *                  unsigned int size   : Number of bytes to fill.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Fills size bytes at dst with value using memory DMA
+ *                  stream 1 in 16-bit words and busy-waits until the
+ *                  transfer is flagged as done.
+ *
+ *  SPECIAL NOTES : The DMA word count is size / 2; an odd trailing byte is
+ *                  stored by the CPU.  Requests of 64K words or more are
+ *                  only reported, not split.
+ *                  NOTE(review): 16-bit transfers presumably require dst to
+ *                  be 2-byte aligned - confirm against the callers.
+ *
+ ****************************************************************************/
+void
+extend_memset(void *dst, unsigned char value, unsigned int size)
+{
+    // Single 16-bit source word holding the fill byte twice.  The source
+    // address is never advanced (x_modify == 0), so it is re-read for
+    // every destination word.
+    unsigned short quad_value;
+    unsigned int words = size / 2;
+
+    quad_value = (unsigned int) value;
+    quad_value |= (unsigned int) value << 8;
+
+    if (words >= 64 * 1024)
+        printf("_Extend_memset__________ dma memset is broken\n");
+
+    // Never program a zero element count.
+    // NOTE(review): the Blackfin DMA reference describes X_COUNT == 0 as
+    // 65536 elements, which would overwrite memory - confirm.
+    if (words)
+    {
+        *p_mdma_s1_start_addr = &quad_value;
+        *p_mdma_s1_x_count = words;
+        *p_mdma_s1_x_modify = 0x0;
+        *p_mdma_d1_start_addr = dst;
+        *p_mdma_d1_x_count = words;
+        *p_mdma_d1_x_modify = 2;
+
+        // Enable the source side first, then the destination side with
+        // its completion interrupt flag.
+        *p_mdma_s1_config = DMAEN | WDSIZE_16;
+        asm("ssync;");
+
+        *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16;
+        asm("ssync;");
+
+        // Spin until the destination channel reports completion, then
+        // write the flag back to acknowledge it.
+        while ((*p_mdma_d1_irq_status & DMA_DONE) == 0);
+
+        *p_mdma_d1_irq_status |= DMA_DONE;
+    }
+
+    // size / 2 drops the last byte of an odd-sized request; store it here.
+    if (size & 1)
+        ((unsigned char *) dst)[size - 1] = value;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : extend_memcpy
+ *
+ *  INPUTS        : void *dst         : Destination of the copy.
+ *                  void *src         : Source of the copy.
+ *                  unsigned int size : Number of bytes to copy.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies size bytes from src to dst using memory DMA
+ *                  stream 1 in 16-bit words and busy-waits until the
+ *                  transfer is flagged as done.
+ *
+ *  SPECIAL NOTES : The DMA word count is size / 2; an odd trailing byte is
+ *                  copied by the CPU.  Sizes that are not a multiple of 4,
+ *                  or of 64K words or more, are reported but still issued.
+ *                  NOTE(review): 16-bit transfers presumably require src
+ *                  and dst to be 2-byte aligned - confirm against callers.
+ *
+ ****************************************************************************/
+void
+extend_memcpy(void *dst, void *src, unsigned int size)
+{
+    unsigned int words = size / 2;
+
+    if (words >= 64 * 1024)
+        printf("_Extend_memcpy__________ dma memcpy is broken\n");
+
+
+    if ((size & 0x3))
+        printf("_)__________ size not a multiple of 4\n");
+
+//32 bit dma here caused some data to be corrupted --- WHY ??????
+
+    // Never program a zero element count.
+    // NOTE(review): the Blackfin DMA reference describes X_COUNT == 0 as
+    // 65536 elements, which would overwrite memory - confirm.
+    if (words)
+    {
+        *p_mdma_s1_start_addr = src;
+        *p_mdma_s1_x_count = words;
+        *p_mdma_s1_x_modify = 2;
+        *p_mdma_d1_start_addr = dst;
+        *p_mdma_d1_x_count = words;
+        *p_mdma_d1_x_modify = 2;
+
+        // Enable the source side first, then the destination side with
+        // its completion interrupt flag.
+        *p_mdma_s1_config = DMAEN | WDSIZE_16;
+        asm("ssync;");
+
+        *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16;
+        asm("ssync;");
+
+        // Spin until the destination channel reports completion, then
+        // write the flag back to acknowledge it.
+        while ((*p_mdma_d1_irq_status & DMA_DONE) == 0);
+
+        *p_mdma_d1_irq_status |= DMA_DONE;
+    }
+
+    // size / 2 drops the last byte of an odd-sized request; copy it here.
+    if (size & 1)
+        ((unsigned char *) dst)[size - 1] = ((unsigned char *) src)[size - 1];
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : bf_extend_plane_borders
+ *
+ *  INPUTS        : unsigned char *plane : First visible pixel of the plane.
+ *                  int width            : Visible width in pixels.
+ *                  int height           : Visible height in rows.
+ *                  int stride           : Distance in bytes between rows.
+ *                  unsigned int border  : Border size on each side.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Replicates the edge pixels of one plane into its border:
+ *                  first the left/right columns of every visible row, then
+ *                  the (already widened) first and last rows into the top
+ *                  and bottom borders.
+ *
+ ****************************************************************************/
+static void
+bf_extend_plane_borders(unsigned char *plane, int width, int height,
+                        int stride, unsigned int border)
+{
+    int row;
+    unsigned char *left_pixel  = plane;
+    unsigned char *right_pixel = plane + width - 1;
+    unsigned char *top_row, *bottom_row, *fill;
+
+    // Left and right borders: smear the outermost pixel of each row.
+    for (row = 0; row < height; row++)
+    {
+        extend_memset(left_pixel - border, left_pixel[0], border);
+        extend_memset(right_pixel + 1, right_pixel[0], border);
+        left_pixel  += stride;
+        right_pixel += stride;
+    }
+
+    // Top and bottom borders: duplicate the full-stride first and last rows.
+    top_row    = plane - border;
+    bottom_row = top_row + (height * stride) - stride;
+
+    fill = top_row - (border * stride);
+
+    for (row = 0; row < (int)border; row++)
+    {
+        extend_memcpy(fill, top_row, stride);
+        fill += stride;
+    }
+
+    fill = bottom_row + stride;
+
+    for (row = 0; row < (int)border; row++)
+    {
+        extend_memcpy(fill, bottom_row, stride);
+        fill += stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_extend_frame_borders
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame whose borders are filled.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Extends the Y, U and V planes of the frame into their
+ *                  borders, in that order.
+ *
+ *  SPECIAL NOTES : The chroma width, height, stride and border are taken as
+ *                  half of the luma values rather than read from the uv_*
+ *                  fields.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int border = ybf->border;
+    int stride = ybf->y_stride;
+    int height = ybf->y_height;
+    int width  = ybf->y_width;
+
+    bf_extend_plane_borders(ybf->y_buffer, width, height, stride, border);
+
+    // Both chroma planes share the halved luma geometry.
+    stride /= 2;
+    height /= 2;
+    width  /= 2;
+    border /= 2;
+
+    bf_extend_plane_borders(ybf->u_buffer, width, height, stride, border);
+    bf_extend_plane_borders(ybf->v_buffer, width, height, stride, border);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : bf_copy_plane_rows
+ *
+ *  INPUTS        : unsigned char *dest   : First destination row.
+ *                  int dest_stride       : Bytes between destination rows.
+ *                  unsigned char *source : First source row.
+ *                  int source_stride     : Bytes between source rows.
+ *                  int width             : Bytes copied per row.
+ *                  int rows              : Number of rows to copy.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies a rectangle of pixels one row at a time with
+ *                  extend_memcpy().
+ *
+ ****************************************************************************/
+static void
+bf_copy_plane_rows(unsigned char *dest, int dest_stride,
+                   unsigned char *source, int source_stride,
+                   int width, int rows)
+{
+    int row;
+
+    for (row = 0; row < rows; row++)
+    {
+        extend_memcpy(dest, source, width);
+        source += source_stride;
+        dest   += dest_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+ *                  YV12_BUFFER_CONFIG *dst_ybc : Frame to copy into.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the visible Y, U and V pixels of the source frame
+ *                  into the destination frame, then rebuilds the
+ *                  destination's borders from the copied pixels.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size; widths
+ *                  and heights are taken from the source, strides from each
+ *                  frame respectively.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    bf_copy_plane_rows(dst_ybc->y_buffer, dst_ybc->y_stride,
+                       src_ybc->y_buffer, src_ybc->y_stride,
+                       src_ybc->y_width, src_ybc->y_height);
+
+    bf_copy_plane_rows(dst_ybc->u_buffer, dst_ybc->uv_stride,
+                       src_ybc->u_buffer, src_ybc->uv_stride,
+                       src_ybc->uv_width, src_ybc->uv_height);
+
+    bf_copy_plane_rows(dst_ybc->v_buffer, dst_ybc->uv_stride,
+                       src_ybc->v_buffer, src_ybc->uv_stride,
+                       src_ybc->uv_width, src_ybc->uv_height);
+
+    // Only visible pixels were copied, so regenerate the borders.
+    vp8_yv12_extend_frame_borders(dst_ybc);
+}
diff --git a/vpx_scale/dm642/bicubic_scaler_c64.c b/vpx_scale/dm642/bicubic_scaler_c64.c
new file mode 100644
index 000000000..9bd379725
--- /dev/null
+++ b/vpx_scale/dm642/bicubic_scaler_c64.c
@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vpxscale_arbitrary.h"
+
+extern BICUBIC_SCALER_STRUCT g_b_scaler;
+
+// Scales one image plane using the tables held in g_b_scaler.
+//
+// For every output row the source row index is taken from l_h[] and the
+// four vertical coefficients for the current phase from c_h[]; the row is
+// filtered vertically into g_b_scaler.hbuf (or used directly when no
+// filtering is possible) and then resampled horizontally using l_w[] and
+// c_w[].  Both filters shift their sums right by 12.
+//
+//   in_width, in_height, in_stride    : geometry of input_image
+//   out_width, out_height, out_stride : geometry of output_image
+//   input_image / output_image        : source and destination planes
+//
+// Always returns 0.
+//
+// NOTE(review): the vertical loop advances 4 pixels at a time and stores
+// with _amem4, so in_width is presumably a multiple of 4 and hbuf 4-byte
+// aligned - confirm.
+int bicubic_scale_c64(int in_width, int in_height, int in_stride,
+                      int out_width, int out_height, int out_stride,
+                      unsigned char *input_image, unsigned char *output_image)
+{
+    short *restrict l_w, * restrict l_h;
+    short *restrict c_w, * restrict c_h;
+    unsigned char *restrict ip, * restrict op, *restrict op_w;
+    unsigned char *restrict hbuf;
+    int h, w, lw, lh;
+    int phase_offset_w, phase_offset_h;
+    double coeff;
+    int max_phase;
+
+    // coefficient tables, 4 shorts per phase
+    c_w = g_b_scaler.c_w;
+    c_h = g_b_scaler.c_h;
+
+    op = output_image;
+
+    // source index tables, one entry per output column / row
+    l_w = g_b_scaler.l_w;
+    l_h = g_b_scaler.l_h;
+
+    phase_offset_h = 0;
+
+    for (h = 0; h < out_height; h++)
+    {
+        // select the row to work on
+        lh = l_h[h];
+        ip = input_image + (in_stride * lh);
+
+        // all four vertical coefficients for this phase in one 8-byte load
+        coeff = _memd8_const(&c_h[phase_offset_h*4]);
+
+        // Filter the row vertically into a temporary buffer.
+        //  If the phase offset == 0 then all the multiplication
+        //  is going to result in the output equalling the input.
+        //  So instead point the temporary buffer to the input.
+        //  Also handle the boundary condition of not being able to
+        //  filter the last lines.
+        // NOTE(review): the filter reads row lh - 1; nothing here rules
+        //  out lh == 0 with a non-zero phase - confirm the tables do.
+        if (phase_offset_h && (lh < in_height - 2))
+        {
+            hbuf = g_b_scaler.hbuf;
+
+            for (w = 0; w < in_width; w += 4)
+            {
+                int ip1, ip2, ip3, ip4;
+                int y13_12, y11_10, y23_22, y21_20, y33_32, y31_30, y43_42, y41_40;
+                int y10_20, y11_21, y12_22, y13_23, y30_40, y31_41, y32_42, y33_43;
+                int s1, s2, s3, s4;
+
+                // four pixels from each of the rows lh-1, lh, lh+1, lh+2
+                ip1 = _mem4_const(&ip[w - in_stride]);
+                ip2 = _mem4_const(&ip[w]);
+                ip3 = _mem4_const(&ip[w + in_stride]);
+                ip4 = _mem4_const(&ip[w + 2*in_stride]);
+
+                // realignment of data.  Unpack the data so that it is in short
+                //  format instead of bytes.
+                y13_12 = _unpkhu4(ip1);
+                y11_10 = _unpklu4(ip1);
+                y23_22 = _unpkhu4(ip2);
+                y21_20 = _unpklu4(ip2);
+                y33_32 = _unpkhu4(ip3);
+                y31_30 = _unpklu4(ip3);
+                y43_42 = _unpkhu4(ip4);
+                y41_40 = _unpklu4(ip4);
+
+                // repack the data so that elements 1 and 2 are together.  this
+                //  lines up so that a dot product with the coefficients can be
+                //  done.
+                y10_20 = _pack2(y11_10, y21_20);
+                y11_21 = _packh2(y11_10, y21_20);
+                y12_22 = _pack2(y13_12, y23_22);
+                y13_23 = _packh2(y13_12, y23_22);
+
+                // rows 1 and 2 against the upper half of the coefficients
+                s1 = _dotp2(_hi(coeff), y10_20);
+                s2 = _dotp2(_hi(coeff), y11_21);
+                s3 = _dotp2(_hi(coeff), y12_22);
+                s4 = _dotp2(_hi(coeff), y13_23);
+
+                // now repack elements 3 and 4 together.
+                y30_40 = _pack2(y31_30, y41_40);
+                y31_41 = _packh2(y31_30, y41_40);
+                y32_42 = _pack2(y33_32, y43_42);
+                y33_43 = _packh2(y33_32, y43_42);
+
+                // rows 3 and 4 against the lower half of the coefficients
+                s1 += _dotp2(_lo(coeff), y30_40);
+                s2 += _dotp2(_lo(coeff), y31_41);
+                s3 += _dotp2(_lo(coeff), y32_42);
+                s4 += _dotp2(_lo(coeff), y33_43);
+
+                // drop the 12 fractional bits
+                s1 = s1 >> 12;
+                s2 = s2 >> 12;
+                s3 = s3 >> 12;
+                s4 = s4 >> 12;
+
+                // pack the four results back into bytes and store them
+                s1 = _pack2(s2, s1);
+                s2 = _pack2(s4, s3);
+
+                _amem4(&hbuf[w])  = _spacku4(s2, s1);
+            }
+        }
+        else
+            hbuf = ip;
+
+        // increase the phase offset for the next time around.
+        if (++phase_offset_h >= g_b_scaler.nh)
+            phase_offset_h = 0;
+
+        op_w = op;
+
+        // will never be able to interpolate first pixel, so just copy it
+        // over here.
+        phase_offset_w = 1;
+        *op_w++ = hbuf[0];
+
+        // with a single horizontal phase, wrap straight back to phase 0
+        if (1 >= g_b_scaler.nw) phase_offset_w = 0;
+
+        max_phase = g_b_scaler.nw;
+
+        for (w = 1; w < out_width; w++)
+        {
+            double coefficients;
+            int hbuf_high, hbuf_low, hbuf_both;
+            int sum_high, sum_low, sum;
+
+            // get the index to use to expand the image
+            lw = l_w[w];
+            coefficients = _amemd8_const(&c_w[phase_offset_w*4]);
+            // four neighbouring pixels starting one to the left of lw
+            hbuf_both = _mem4_const(&hbuf[lw-1]);
+
+            hbuf_high = _unpkhu4(hbuf_both);
+            hbuf_low  = _unpklu4(hbuf_both);
+
+            sum_high = _dotp2(_hi(coefficients), hbuf_high);
+            sum_low  = _dotp2(_lo(coefficients), hbuf_low);
+
+            sum = (sum_high + sum_low) >> 12;
+
+            if (++phase_offset_w >= max_phase)
+                phase_offset_w = 0;
+
+            // too close to the right edge to filter: copy the pixel instead
+            if ((lw + 2) >= in_width)
+                sum = hbuf[lw];
+
+            // NOTE(review): unlike the vertical pass, sum is stored without
+            //  an explicit saturating pack - confirm it cannot leave 0..255.
+            *op_w++ = sum;
+        }
+
+        op += out_stride;
+    }
+
+    return 0;
+}
+
+// Scales a whole YV12 frame to new_width x new_height with
+// bicubic_scale_c64().  The destination dimensions are overwritten first;
+// its strides are set equal to its widths (no padding between rows), and
+// the chroma planes are scaled to half the new luma size.
+void bicubic_scale_frame_c64(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                             int new_width, int new_height)
+{
+    const int chroma_width  = new_width / 2;
+    const int chroma_height = new_height / 2;
+
+    // Describe the destination geometry before any plane is written.
+    dst->y_width   = new_width;
+    dst->y_height  = new_height;
+    dst->uv_width  = chroma_width;
+    dst->uv_height = chroma_height;
+
+    dst->y_stride  = dst->y_width;
+    dst->uv_stride = dst->uv_width;
+
+    // Y plane
+    bicubic_scale_c64(src->y_width, src->y_height, src->y_stride,
+                      new_width, new_height, dst->y_stride,
+                      src->y_buffer, dst->y_buffer);
+
+    // U plane
+    bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
+                      chroma_width, chroma_height, dst->uv_stride,
+                      src->u_buffer, dst->u_buffer);
+
+    // V plane
+    bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
+                      chroma_width, chroma_height, dst->uv_stride,
+                      src->v_buffer, dst->v_buffer);
+}
diff --git a/vpx_scale/dm642/gen_scalers_c64.c b/vpx_scale/dm642/gen_scalers_c64.c
new file mode 100644
index 000000000..2126a7534
--- /dev/null
+++ b/vpx_scale/dm642/gen_scalers_c64.c
@@ -0,0 +1,607 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     gen_scalers_c64.c
+ *
+ *   Description  :     Generic image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_4_5_scale_c64
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Number of source pixels to
+ *                                                 scale.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : Every group of 4 source pixels produces 5 output pixels.
+ *                  The loop bound is unsigned, so source_width is presumably
+ *                  a multiple of 4 and at least 4 - confirm with callers.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_c64
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned i;
+    unsigned int ba, cb, dc, ed;
+    unsigned char *restrict des = dest;
+    unsigned int *restrict src = (unsigned int *)source;
+    unsigned int const_51_205, const_102_154,
+             const_205_51, const_154_102;
+
+    unsigned int src_current, src_next;
+
+    (void) dest_width;
+
+    // Constants that are to be used for the filtering.  For
+    //  best speed we are going to want to right shift by 16.
+    //  In the generic version they were shift by 8, so put
+    //  an extra 8 in now so that 16 will come out later.
+    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    // 5 points are needed to filter to give 5 output points.
+    //  A load can pull up 4 at a time, and one needs to be
+    //  "borrowed" from the next set of data.  So instead of
+    //  loading those 5 points each time, "steal" a point from
+    //  the next set and only load up 4 each time through.
+    src_current = _mem4(src);
+
+    for (i = 0; i < source_width - 4; i += 4)
+    {
+        // Advance BEFORE loading: src_next must hold the group that
+        //  follows src_current.  Loading first and advancing afterwards
+        //  re-reads the current group, which feeds the wrong pixel into
+        //  the fifth tap and leaves the last group unprocessed.
+        src++;
+        src_next = _mem4(src);
+
+        // Reorder the data so that it is ready for the
+        //  dot product.  The least significant byte of a loaded word is
+        //  treated as the first pixel (a), the most significant as d.
+        ba = _unpklu4(src_current);
+        cb = _unpkhu4(_rotl(src_current, 8));
+        dc = _unpkhu4(src_current);
+        ed = _unpkhu4(_shrmb(src_next, src_current));
+
+        // Use the dot product with round and shift.
+        des [0] = src_current & 0xff;
+        des [1] = _dotprsu2(ba, const_205_51);
+        des [2] = _dotprsu2(cb, const_154_102);
+        des [3] = _dotprsu2(dc, const_102_154);
+        des [4] = _dotprsu2(ed, const_51_205);
+
+        des += 5;
+
+        // reuse loaded values next time around.
+        src_current = src_next;
+    }
+
+    // Filter the last set of points.  Normally a point from the next set
+    //  would be used, but there is no next set, so just fill.
+    ba = _unpklu4(src_current);
+    cb = _unpkhu4(_rotl(src_current, 8));
+    dc = _unpkhu4(src_current);
+
+    des [0] = src_current & 0xff;
+    des [1] = _dotprsu2(ba, const_205_51);
+    des [2] = _dotprsu2(cb, const_154_102);
+    des [3] = _dotprsu2(dc, const_102_154);
+    // Fill with the LAST pixel of the group (most significant byte), not
+    //  the first one.
+    des [4] = src_current >> 24;
+
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_4_5_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales a vertical band of pixels by 4 to 5, in place.
+ *                  Rows 1 to 4 of the band are rewritten from rows 0 to 3
+ *                  and row 5.
+ *
+ *  SPECIAL NOTES : Row 5 is the first line of the band below the current
+ *                  band.  The pixels of column n + 1 are loaded before
+ *                  column n is stored, so one column past dest_width is
+ *                  read (its values are never used).
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Filter taps, pre-shifted by 8 for the rounding dot product.
+    const unsigned int taps_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const unsigned int taps_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    // Byte offsets of the band rows relative to row 0.
+    const unsigned int row1 = dest_pitch;
+    const unsigned int row2 = dest_pitch * 2;
+    const unsigned int row3 = dest_pitch * 3;
+    const unsigned int row4 = dest_pitch * 4;
+    const unsigned int row5 = dest_pitch * 5;
+
+    unsigned int col;
+    unsigned int p0, p1, p2, p3, p5;
+    unsigned int pair10, pair21, pair32, pair53;
+
+    // Prime the pipeline with the first column.
+    p0 = dest [0];
+    p1 = dest [row1];
+    p2 = dest [row2];
+    p3 = dest [row3];
+    p5 = dest [row5];
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *out = dest + col;
+        unsigned char *ahead = out + 1;
+
+        // Pair up vertically adjacent pixels of the current column.
+        pair10 = _pack2(p1, p0);
+        pair21 = _pack2(p2, p1);
+        pair32 = _pack2(p3, p2);
+        pair53 = _pack2(p5, p3);
+
+        // Fetch the next column before the current one is overwritten.
+        p0 = ahead [0];
+        p1 = ahead [row1];
+        p2 = ahead [row2];
+        p3 = ahead [row3];
+        p5 = ahead [row5];
+
+        out [row1] = _dotprsu2(pair10, taps_205_51);
+        out [row2] = _dotprsu2(pair21, taps_154_102);
+        out [row3] = _dotprsu2(pair32, taps_102_154);
+        out [row4] = _dotprsu2(pair53, taps_51_205);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_4_5_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales the last vertical band of pixels by 4 to 5, in
+ *                  place.  Rows 1 to 3 are filtered from rows 0 to 3 and
+ *                  row 4 is a copy of row 3.
+ *
+ *  SPECIAL NOTES : There is no band below the last one, so the final output
+ *                  row cannot be interpolated.  The pixels of column n + 1
+ *                  are loaded before column n is stored, so one column past
+ *                  dest_width is read (its values are never used).
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Filter taps, pre-shifted by 8 for the rounding dot product.
+    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const unsigned int taps_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    // Byte offsets of the band rows relative to row 0.
+    const unsigned int row1 = dest_pitch;
+    const unsigned int row2 = dest_pitch * 2;
+    const unsigned int row3 = dest_pitch * 3;
+    const unsigned int row4 = dest_pitch * 4;
+
+    unsigned int col;
+    unsigned int p0, p1, p2, p3;
+    unsigned int pair10, pair21, pair32;
+
+    // Prime the pipeline with the first column.
+    p0 = dest [0];
+    p1 = dest [row1];
+    p2 = dest [row2];
+    p3 = dest [row3];
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *out = dest + col;
+        unsigned char *ahead = out + 1;
+
+        pair10 = _pack2(p1, p0);
+        pair21 = _pack2(p2, p1);
+        pair32 = _pack2(p3, p2);
+
+        // Fetch the next column before the current one is overwritten.
+        p0 = ahead [0];
+        p1 = ahead [row1];
+        p2 = ahead [row2];
+        p3 = ahead [row3];
+
+        out [row1] = _dotprsu2(pair10, taps_205_51);
+        out [row2] = _dotprsu2(pair21, taps_154_102);
+        out [row3] = _dotprsu2(pair32, taps_102_154);
+        // p3 already belongs to the next column at this point, exactly as
+        // in the pipelined load order this routine has always used.
+        out [row4] = (unsigned char) p3;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_3_5_scale_c64
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 5.
+ *
+ *  SPECIAL NOTES : Every group of 3 source pixels produces 5 output pixels.
+ *                  Each step loads 4 bytes; the fourth is the first pixel
+ *                  of the next group, and for the final group it is loaded
+ *                  but not used.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_c64
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    // Filter taps, pre-shifted by 8 for the rounding dot product.
+    const unsigned int taps_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const unsigned int taps_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    unsigned char *restrict out = dest;
+    unsigned char *restrict in = (unsigned char *)source;
+    unsigned int consumed;
+    unsigned int quad;
+    unsigned int pair10, pair21, pair32;
+
+    (void) dest_width;
+
+    // All groups except the last: the fifth output blends in the first
+    //  pixel of the following group.
+    for (consumed = 0; consumed < source_width - 3; consumed += 3)
+    {
+        quad = _mem4(in);
+
+        // Split the four bytes into adjacent-pixel pairs for the dot
+        //  products.
+        pair10 = _unpklu4(quad);
+        pair21 = _unpkhu4(_rotl(quad, 8));
+        pair32 = _unpkhu4(quad);
+
+        out [0] = quad & 0xff;
+        out [1] = _dotprsu2(pair10, taps_154_102);
+        out [2] = _dotprsu2(pair21, taps_51_205);
+        out [3] = _dotprsu2(pair21, taps_205_51);
+        out [4] = _dotprsu2(pair32, taps_102_154);
+
+        in  += 3;
+        out += 5;
+    }
+
+    // Last group: there is no following pixel, so the fifth output is a
+    //  plain copy of the group's third pixel.
+    quad = _mem4(in);
+
+    pair10 = _unpklu4(quad);
+    pair21 = _unpkhu4(_rotl(quad, 8));
+    pair32 = _unpkhu4(quad);
+
+    out [0] = quad & 0xff;
+    out [1] = _dotprsu2(pair10, taps_154_102);
+    out [2] = _dotprsu2(pair21, taps_51_205);
+    out [3] = _dotprsu2(pair21, taps_205_51);
+    out [4] = pair32 & 0xff;
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_3_5_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales a vertical band of pixels by 3 to 5, in place.
+ *                  Rows 1 to 4 of the band are rewritten from rows 0 to 2
+ *                  and row 5.
+ *
+ *  SPECIAL NOTES : Row 5 is the first line of the band below the current
+ *                  band.  The pixels of column n + 1 are loaded before
+ *                  column n is stored, so one column past dest_width is
+ *                  read (its values are never used).
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Filter taps, pre-shifted by 8 for the rounding dot product.
+    const unsigned int taps_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const unsigned int taps_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
+    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    // Byte offsets of the band rows relative to row 0.
+    const unsigned int row1 = dest_pitch;
+    const unsigned int row2 = dest_pitch * 2;
+    const unsigned int row3 = dest_pitch * 3;
+    const unsigned int row4 = dest_pitch * 4;
+    const unsigned int row5 = dest_pitch * 5;
+
+    unsigned int col;
+    unsigned int p0, p1, p2, p5;
+    unsigned int pair10, pair21, pair52;
+
+    // Prime the pipeline with the first column.
+    p0 = dest [0];
+    p1 = dest [row1];
+    p2 = dest [row2];
+    p5 = dest [row5];
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *out = dest + col;
+        unsigned char *ahead = out + 1;
+
+        pair10 = _pack2(p1, p0);
+        pair21 = _pack2(p2, p1);
+        pair52 = _pack2(p5, p2);
+
+        // Fetch the next column before the current one is overwritten.
+        p0 = ahead [0];
+        p1 = ahead [row1];
+        p2 = ahead [row2];
+        p5 = ahead [row5];
+
+        out [row1] = _dotprsu2(pair10, taps_154_102);
+        out [row2] = _dotprsu2(pair21, taps_51_205);
+        out [row3] = _dotprsu2(pair21, taps_205_51);
+        out [row4] = _dotprsu2(pair52, taps_102_154);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_3_5_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales the last vertical band of pixels by 3 to 5, in
+ *                  place.  Rows 1 to 3 are filtered from rows 0 to 2 and
+ *                  row 4 is a copy of row 2.
+ *
+ *  SPECIAL NOTES : There is no band below the last one, so the final output
+ *                  row cannot be interpolated.  The pixels of column n + 1
+ *                  are loaded before column n is stored, so one column past
+ *                  dest_width is read (its values are never used).
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Filter taps, pre-shifted by 8 for the rounding dot product.
+    const unsigned int taps_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
+    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
+    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
+
+    // Byte offsets of the band rows relative to row 0.
+    const unsigned int row1 = dest_pitch;
+    const unsigned int row2 = dest_pitch * 2;
+    const unsigned int row3 = dest_pitch * 3;
+    const unsigned int row4 = dest_pitch * 4;
+
+    unsigned int col;
+    unsigned int p0, p1, p2;
+    unsigned int pair10, pair21;
+
+    // Prime the pipeline with the first column.
+    p0 = dest [0];
+    p1 = dest [row1];
+    p2 = dest [row2];
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *out = dest + col;
+        unsigned char *ahead = out + 1;
+
+        pair10 = _pack2(p1, p0);
+        pair21 = _pack2(p2, p1);
+
+        // Fetch the next column before the current one is overwritten.
+        p0 = ahead [0];
+        p1 = ahead [row1];
+        p2 = ahead [row2];
+
+        out [row1] = _dotprsu2(pair10, taps_154_102);
+        out [row2] = _dotprsu2(pair21, taps_51_205);
+        out [row3] = _dotprsu2(pair21, taps_205_51);
+        // p2 already belongs to the next column at this point, exactly as
+        // in the pipelined load order this routine has always used.
+        out [row4] = (unsigned char)(p2) ;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_1_2_scale_c64
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 1 to 2.
+ *
+ *  SPECIAL NOTES : source width must be a multiple of 4.
+ *
+ ****************************************************************************/
+void horizontal_line_1_2_scale_c64
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    // Doubles a line horizontally.  Each loop iteration consumes 4 source
+    //  bytes and stores 8 destination bytes; source_width / 4 iterations
+    //  are run, so any remainder of source_width modulo 4 is not processed.
+    //  dest_width is not referenced.
+    unsigned int i;
+    unsigned char *restrict des = dest;
+    unsigned char *restrict src = (unsigned char *)source;
+    unsigned int src7_4i, src4_1i, src3_0i;
+    unsigned int a4_0i, ahi, alo;
+    double src7_0d, src3_0d;
+    const unsigned int k01 = 0x01010101;
+
+    for (i = 0; i < source_width / 4; i += 1)
+    {
+        // Load up the data from src.  Here a wide load is
+        //  used to get 8 bytes at once, only 5 will be used
+        //  for the actual computation.
+        // NOTE(review): src only advances by 4 per iteration, so the load
+        //  on the final iteration extends 4 bytes beyond the bytes consumed
+        //  and the fifth byte used lies past the last group -- confirm the
+        //  caller's buffer is readable (and meaningful) there.
+        src7_0d = _memd8(src);
+        src3_0i = _lo(src7_0d);
+        src7_4i = _hi(src7_0d);
+
+        // Need to average between points.  Shift byte 5 into
+        //  the lower word.  This will result in bytes 5-1
+        //  averaged with 4-0.
+        src4_1i = _shrmb(src7_4i, src3_0i);
+        a4_0i = _avgu4(src4_1i, src3_0i);
+
+        // Expand the data out. Could do an unpack, however
+        //  all but the multiply units are getting pretty hard
+        //  here the multiply unit can take some of the computations.
+        // (Multiplying every byte by 1 via k01 presumably widens each
+        //  source byte to 16 bits -- TODO confirm _mpyu4 semantics.)
+        src3_0d = _mpyu4(src3_0i, k01);
+
+        // The averages need to be unpacked so that they are in 16
+        //  bit form and will be able to be interleaved with the
+        //  original data
+        ahi = _unpkhu4(a4_0i);
+        alo = _unpklu4(a4_0i);
+
+        ahi = _swap4(ahi);
+        alo = _swap4(alo);
+
+        // Mix the average result in with the original data.
+        ahi = _hi(src3_0d) | ahi;
+        alo = _lo(src3_0d) | alo;
+
+        // One 8-byte store of the interleaved result.
+        _memd8(des) = _itod(ahi, alo);
+
+        des += 8;
+        src += 4;
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_1_2_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+ *                  height of the band scaled is 1-pixel.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *                  Destination width must be a multiple of 4: the input
+ *                  width must be, and therefore so must the output width.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Fills the line at dest + dest_pitch with the packed byte average
+    //  (_avgu4) of the line above it (dest) and the line below it
+    //  (dest + 2 * dest_pitch), four bytes per step.
+    unsigned int *restrict above  = (unsigned int *)dest;
+    unsigned int *restrict middle = (unsigned int *)(dest + dest_pitch);
+    unsigned int *restrict below  = (unsigned int *)(dest + (dest_pitch * 2));
+    unsigned int words_left;
+
+    for (words_left = dest_width / 4; words_left > 0; words_left--)
+    {
+        unsigned int upper = _mem4(above);
+        unsigned int lower = _mem4(below);
+
+        _mem4(middle) = _avgu4(upper, lower);
+
+        above++;
+        below++;
+        middle++;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_1_2_scale_c64
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 1 to 2. The
+ *                  height of the band scaled is 1-pixel.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.  Again, width must be a multiple of 4.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // No line below to average with: duplicate the line at dest into the
+    //  line at dest + dest_pitch, four bytes per step.
+    unsigned int *restrict from = (unsigned int *)dest;
+    unsigned int *restrict to   = (unsigned int *)(dest + dest_pitch);
+    unsigned int words_left;
+
+    for (words_left = dest_width / 4; words_left > 0; words_left--)
+    {
+        _mem4(to) = _mem4(from);
+        to++;
+        from++;
+    }
+}
+
+void
+register_generic_scalers(void)
+{
+    // Point the scaler hooks at the C64 implementations in this file.
+
+    // 4:5
+    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_c64;
+    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_c64;
+    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_c64;
+
+    // 3:5
+    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_c64;
+    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_c64;
+    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_c64;
+
+    // 1:2
+    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_c64;
+    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_c64;
+    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_c64;
+}
diff --git a/vpx_scale/dm642/yv12extend.c b/vpx_scale/dm642/yv12extend.c
new file mode 100644
index 000000000..ca25a5fce
--- /dev/null
+++ b/vpx_scale/dm642/yv12extend.c
@@ -0,0 +1,445 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     yv12extend.c
+ *
+ *   Description  :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+//#include <stdlib.h>
+#include "csl_dat.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+#define UINT8 unsigned char
+#define UINT32 unsigned int
+
+
+static inline
+void copy_yleft_right_border(
+    UINT8 *restrict src_ptr1,
+    UINT8 *restrict src_ptr2,
+    UINT8 *restrict dest_ptr1,
+    UINT8 *restrict dest_ptr2,
+    UINT32  plane_height,
+    UINT32  plane_stride
+)
+{
+    // For each of plane_height rows, replicates the byte at src_ptr1 into
+    //  the 48 bytes starting at dest_ptr1 and the byte at src_ptr2 into the
+    //  48 bytes starting at dest_ptr2, then steps all four pointers down one
+    //  row (plane_stride bytes).
+    //  The fill width is fixed at 48 bytes regardless of the caller's border
+    //  size, and the stores use _amemd8 -- presumably requiring 8-byte
+    //  aligned destinations (TODO confirm both against the callers).
+    UINT32 left, right, left2, left4, right2, right4;
+    double dl, dr;
+    UINT32 i;
+
+    // Promise only a minimum trip count and a multiple.  The upper bound is
+    //  left blank because plane_height is supplied by the caller and is not
+    //  limited to 16 rows.
+#pragma MUST_ITERATE(16, , 16)
+
+    for (i = 0; i < plane_height; i++)
+    {
+        left  = src_ptr1[0];
+        right = src_ptr2[0];
+
+        // Splat each edge byte across a 32-bit word, then across 8 bytes.
+        left2 = _pack2(left, left);
+        left4 = _packl4(left2, left2);
+
+        right2 = _pack2(right, right);
+        right4 = _packl4(right2, right2);
+
+        dl = _itod(left4, left4);
+        dr = _itod(right4, right4);
+
+        _amemd8(&dest_ptr1[ 0]) = dl;
+        _amemd8(&dest_ptr2[ 0]) = dr;
+
+        _amemd8(&dest_ptr1[ 8]) = dl;
+        _amemd8(&dest_ptr2[ 8]) = dr;
+
+        _amemd8(&dest_ptr1[16]) = dl;
+        _amemd8(&dest_ptr2[16]) = dr;
+
+        _amemd8(&dest_ptr1[24]) = dl;
+        _amemd8(&dest_ptr2[24]) = dr;
+
+        _amemd8(&dest_ptr1[32]) = dl;
+        _amemd8(&dest_ptr2[32]) = dr;
+
+        _amemd8(&dest_ptr1[40]) = dl;
+        _amemd8(&dest_ptr2[40]) = dr;
+
+
+        src_ptr1 += plane_stride;
+        src_ptr2 += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+/****************************************************************************
+ *
+ *
+ ****************************************************************************/
+static
+void copy_uvleft_right_border(
+    UINT8 *restrict src_ptr1,
+    UINT8 *restrict src_ptr2,
+    UINT8 *restrict dest_ptr1,
+    UINT8 *restrict dest_ptr2,
+    UINT32  plane_height,
+    UINT32  plane_stride
+)
+{
+    // Chroma counterpart of the luma border fill: for each of plane_height
+    //  rows, replicates the byte at src_ptr1 into the 24 bytes starting at
+    //  dest_ptr1 and the byte at src_ptr2 into the 24 bytes starting at
+    //  dest_ptr2, then steps all four pointers down by plane_stride.
+    //  The fill width is fixed at 24 bytes, and the stores use _amemd8 --
+    //  presumably requiring 8-byte aligned destinations (TODO confirm).
+    UINT32 left, right, left2, left4, right2, right4;
+    double dl, dr;
+    UINT32 i;
+
+    // Promise only a minimum trip count and a multiple.  The upper bound is
+    //  left blank because plane_height is supplied by the caller and is not
+    //  limited to 8 rows.
+#pragma MUST_ITERATE(8, , 8)
+
+    for (i = 0; i < plane_height; i++)
+    {
+        left  = src_ptr1[0];
+        right = src_ptr2[0];
+
+        // Splat each edge byte across a 32-bit word, then across 8 bytes.
+        left2 = _pack2(left, left);
+        left4 = _packl4(left2, left2);
+
+        right2 = _pack2(right, right);
+        right4 = _packl4(right2, right2);
+
+        dl = _itod(left4, left4);
+        dr = _itod(right4, right4);
+
+        _amemd8(&dest_ptr1[ 0]) = dl;
+        _amemd8(&dest_ptr2[ 0]) = dr;
+
+        _amemd8(&dest_ptr1[ 8]) = dl;
+        _amemd8(&dest_ptr2[ 8]) = dr;
+
+        _amemd8(&dest_ptr1[16]) = dl;
+        _amemd8(&dest_ptr2[16]) = dr;
+
+
+        src_ptr1 += plane_stride;
+        src_ptr2 += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+/****************************************************************************
+ *
+ ****************************************************************************/
+// Replicates the first and last rows of one plane into the `border` rows
+//  above and below it.  `plane` points at the first visible pixel; every
+//  copied row is `stride` bytes long and starts `border` bytes to the left
+//  of the plane, so previously filled left/right border bytes are copied
+//  along with the visible pixels.
+static
+void extend_plane_top_bottom(unsigned char *plane, int border, int stride, int height)
+{
+    unsigned char *src_top = plane - border;
+    unsigned char *src_bot = src_top + (height * stride) - stride;
+    unsigned char *dst_top = src_top - (border * stride);
+    unsigned char *dst_bot = src_bot + stride;
+    int i;
+
+    for (i = 0; i < border; i++)
+    {
+        vpx_memcpy(dst_top, src_top, stride);
+        vpx_memcpy(dst_bot, src_bot, stride);
+        dst_top += stride;
+        dst_bot += stride;
+    }
+}
+
+// Extends the Y, U and V planes of ybf into their borders: first the left
+//  and right edge pixels of every row, then the top and bottom rows.
+//  The chroma border is half the luma border.  Chroma stride, height and
+//  width are taken from the uv_* fields of the buffer description rather
+//  than being derived by halving the luma values.
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    const int y_border  = ybf->border;
+    const int uv_border = y_border / 2;
+    unsigned char *plane;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    plane = ybf->y_buffer;
+    copy_yleft_right_border(plane, plane + ybf->y_width - 1,
+                            plane - y_border, plane + ybf->y_width,
+                            ybf->y_height, ybf->y_stride);
+    extend_plane_top_bottom(plane, y_border, ybf->y_stride, ybf->y_height);
+
+    /***********/
+    /* U Plane */
+    /***********/
+    plane = ybf->u_buffer;
+    copy_uvleft_right_border(plane, plane + ybf->uv_width - 1,
+                             plane - uv_border, plane + ybf->uv_width,
+                             ybf->uv_height, ybf->uv_stride);
+    extend_plane_top_bottom(plane, uv_border, ybf->uv_stride, ybf->uv_height);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    plane = ybf->v_buffer;
+    copy_uvleft_right_border(plane, plane + ybf->uv_width - 1,
+                             plane - uv_border, plane + ybf->uv_width,
+                             ybf->uv_height, ybf->uv_stride);
+    extend_plane_top_bottom(plane, uv_border, ybf->uv_stride, ybf->uv_height);
+}
+/****************************************************************************
+ *
+ ****************************************************************************/
+// Queues DMA copies that replicate the first and last rows of one plane
+//  into the `border` rows above and below it.  The ids returned by the most
+//  recent pair of dat_copy calls are written through tid_top / tid_bot.
+//  Returns non-zero if at least one transfer was queued; when it returns
+//  zero the two ids are left untouched.
+static
+int dma_queue_top_bottom(unsigned char *plane, int border, int stride, int height,
+                         int *tid_top, int *tid_bot)
+{
+    unsigned char *src_top = plane - border;
+    unsigned char *src_bot = src_top + (height * stride) - stride;
+    unsigned char *dst_top = src_top - (border * stride);
+    unsigned char *dst_bot = src_bot + stride;
+    int i;
+
+    for (i = 0; i < border; i++)
+    {
+        // dat_copy takes (source, destination, byte count).
+        *tid_top = dat_copy(src_top, dst_top, stride);
+        *tid_bot = dat_copy(src_bot, dst_bot, stride);
+        dst_top += stride;
+        dst_bot += stride;
+    }
+
+    return border > 0;
+}
+
+// Extends only the top and bottom borders of the Y, U and V planes using
+//  DMA transfers, then waits on the last pair of transfers that was queued.
+//  NOTE(review): only the most recent ids are waited on; this presumably
+//  relies on the DMA engine completing transfers in submission order --
+//  confirm against the dat_wait documentation.
+void
+vpxyv12_extend_frame_tbborders(YV12_BUFFER_CONFIG *ybf)
+{
+    const int y_border  = ybf->border;
+    const int uv_border = y_border / 2;
+    int tid1 = 0, tid2 = 0;
+    int queued = 0;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    queued |= dma_queue_top_bottom(ybf->y_buffer, y_border,
+                                   ybf->y_stride, ybf->y_height,
+                                   &tid1, &tid2);
+
+    /***********/
+    /* U Plane */
+    /***********/
+    queued |= dma_queue_top_bottom(ybf->u_buffer, uv_border,
+                                   ybf->uv_stride, ybf->uv_height,
+                                   &tid1, &tid2);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    queued |= dma_queue_top_bottom(ybf->v_buffer, uv_border,
+                                   ybf->uv_stride, ybf->uv_height,
+                                   &tid1, &tid2);
+
+    // Never wait on an id that no transfer produced (zero-width border).
+    if (queued)
+    {
+        dat_wait(tid1);
+        dat_wait(tid2);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vpxyv12_copy_frame_dma
+ *
+ *  INPUTS        :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the source image into the destination image,
+ *                  including its UMV borders.  The source borders are
+ *                  expected to have been updated prior to this call, so the
+ *                  whole frame is copied, borders and all.  This is also to circumvent
+ *                  using copy_left_right Border functions when copying data
+ *                  between L2 and main memory.  When that occurs a cache
+ *                  clean needs to be done, which would require invalidating
+ *                  an entire frame.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size.
+ *
+ ****************************************************************************/
+void
+vpxyv12_copy_frame_dma(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    // Geometry is taken from the source frame and applied to both frames.
+    const int border   = src_ybc->border;
+    const int y_rows   = src_ybc->y_height;
+    const int uv_rows  = src_ybc->uv_height;
+    const int y_pitch  = src_ybc->y_stride;
+    const int uv_pitch = src_ybc->uv_stride;
+
+    // Distance from the top-left corner of the bordered plane to the first
+    //  visible pixel: `border` rows plus `border` bytes for luma, half of
+    //  that border for chroma.
+    const int y_lead   = border * (y_pitch + 1);
+    const int uv_lead  = border / 2 * (uv_pitch + 1);
+
+    // Each plane is transferred as full-stride rows, borders included.
+    // NOTE(review): the values returned by dat_copy2d are discarded and
+    //  nothing is waited on here -- confirm callers synchronise before
+    //  using the destination frame.
+    dat_copy2d(DAT_2D2D,
+               src_ybc->y_buffer - y_lead,
+               dst_ybc->y_buffer - y_lead,
+               y_pitch,
+               y_rows + 2 * border,
+               y_pitch);
+
+    dat_copy2d(DAT_2D2D,
+               src_ybc->u_buffer - uv_lead,
+               dst_ybc->u_buffer - uv_lead,
+               uv_pitch,
+               uv_rows + border,
+               uv_pitch);
+
+    dat_copy2d(DAT_2D2D,
+               src_ybc->v_buffer - uv_lead,
+               dst_ybc->v_buffer - uv_lead,
+               uv_pitch,
+               uv_rows + border,
+               uv_pitch);
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the source image into the destination image and
+ *                  updates the destination's UMV borders.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size.
+ *
+ ****************************************************************************/
+// Copies `rows` rows of `width` bytes from `from` to `to`, advancing each
+//  pointer by its own stride after every row.
+static
+void copy_plane_rows(unsigned char *to, int to_stride,
+                     unsigned char *from, int from_stride,
+                     int width, int rows)
+{
+    int r;
+
+    for (r = 0; r < rows; r++)
+    {
+        vpx_memcpy(to, from, width);
+        from += from_stride;
+        to   += to_stride;
+    }
+}
+
+// Copies the visible Y, U and V pixels of src_ybc into dst_ybc row by row
+//  (widths and heights come from the source frame, strides from each frame
+//  respectively), then rebuilds the destination's borders.
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    copy_plane_rows(dst_ybc->y_buffer, dst_ybc->y_stride,
+                    src_ybc->y_buffer, src_ybc->y_stride,
+                    src_ybc->y_width, src_ybc->y_height);
+
+    copy_plane_rows(dst_ybc->u_buffer, dst_ybc->uv_stride,
+                    src_ybc->u_buffer, src_ybc->uv_stride,
+                    src_ybc->uv_width, src_ybc->uv_height);
+
+    copy_plane_rows(dst_ybc->v_buffer, dst_ybc->uv_stride,
+                    src_ybc->v_buffer, src_ybc->uv_stride,
+                    src_ybc->uv_width, src_ybc->uv_height);
+
+    vp8_yv12_extend_frame_borders(dst_ybc);
+}
diff --git a/vpx_scale/generic/bicubic_scaler.c b/vpx_scale/generic/bicubic_scaler.c
new file mode 100644
index 000000000..e3c2b4a80
--- /dev/null
+++ b/vpx_scale/generic/bicubic_scaler.c
@@ -0,0 +1,601 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vpxscale_arbitrary.h"
+
+// Selects the fixed-point coefficient code; the float code in the #else
+//  branches of this file is compiled only when this is not defined.
+#define FIXED_POINT
+
+// Dimension limits.  MAX_OUT_DIMENSION is the larger of the two output
+//  limits and sizes the line buffers declared below.
+#define MAX_IN_WIDTH        800
+#define MAX_IN_HEIGHT       600
+#define MAX_OUT_WIDTH       800
+#define MAX_OUT_HEIGHT      600
+#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
+                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)
+
+// Scaler state shared by the functions in this file.
+BICUBIC_SCALER_STRUCT g_b_scaler;
+// 1 until bicubic_coefficient_init() has zeroed g_b_scaler.
+static int g_first_time = 1;
+
+// Temporary line buffer, handed to g_b_scaler.hbuf during setup.
+#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
+#pragma DATA_ALIGN (g_hbuf, 32);
+unsigned char g_hbuf[MAX_OUT_DIMENSION];
+
+// Temporary line buffer, handed to g_b_scaler.hbuf_uv during setup.
+#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
+#pragma DATA_ALIGN (g_hbuf_uv, 32);
+unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];
+
+
+// Cubic parameter.  The fixed-point code stores its magnitude (0.6) scaled
+//  by 65536 and truncated to an int; the float code stores it as -0.6.
+#ifdef FIXED_POINT
+static int a_i = 0.6 * 65536;
+#else
+static float a = -0.6;
+#endif
+
+#ifdef FIXED_POINT
+//         3     2
+// C0 = a*t - a*t
+//
+static INLINE short c0_fixed(unsigned int t)
+{
+    // t is the phase in Q16.  Every product is shifted back down by 16 and
+    //  truncated to 16 bits before the next multiply.
+    const unsigned short a_t  = (a_i * t) >> 16;   // a * t
+    const unsigned short a_t2 = (a_t * t) >> 16;   // a * t^2
+    const unsigned short a_t3 = (a_t2 * t) >> 16;  // a * t^3
+
+    // Drop 4 fractional bits (Q16 -> Q12) and negate the difference.
+    return -((a_t2 - a_t3) >> 4);
+}
+
+//                     2          3
+// C1 = a*t + (3-2*a)*t  - (2-a)*t
+//
+static INLINE short c1_fixed(unsigned int t)
+{
+    // t is the phase in Q16.  Each accumulator is truncated to 16 bits
+    //  after every multiply-and-shift step.
+    unsigned short linear = (a_i * t) >> 16;                 // Q16: a * t
+    unsigned short cubic  = (2 << 13) - (a_i >> 3);          // Q13: 2 - a
+    unsigned short square = (3 << 13) - (2 * (a_i >> 3));    // Q13: 3 - 2a
+    int step;
+
+    // (2 - a) * t^3
+    for (step = 0; step < 3; step++)
+        cubic = (cubic * t) >> 16;
+
+    // (3 - 2a) * t^2
+    for (step = 0; step < 2; step++)
+        square = (square * t) >> 16;
+
+    // Bring the linear term to Q13, combine, then halve to reach Q12.
+    return (((linear >> 3) - cubic + square) >> 1);
+}
+
+//                 2          3
+// C2 = 1 - (3-a)*t  + (2-a)*t
+//
+static INLINE short c2_fixed(unsigned int t)
+{
+    // t is the phase in Q16.  Each accumulator is truncated to 16 bits
+    //  after every multiply-and-shift step.
+    const unsigned short one = 1 << 13;                   // Q13: 1
+    unsigned short square = (3 << 13) - (a_i >> 3);       // Q13: 3 - a
+    unsigned short cubic  = (2 << 13) - (a_i >> 3);       // Q13: 2 - a
+    int step;
+
+    // (3 - a) * t^2
+    for (step = 0; step < 2; step++)
+        square = (square * t) >> 16;
+
+    // (2 - a) * t^3
+    for (step = 0; step < 3; step++)
+        cubic = (cubic * t) >> 16;
+
+    // Combine in Q13, then halve to reach Q12.
+    return (one - square + cubic) >> 1;
+}
+
+//                    2      3
+// C3 = -a*t + 2*a*t  - a*t      (a here is the positive magnitude in a_i)
+//
+static INLINE short c3_fixed(unsigned int t)
+{
+    // t is the phase in Q16.  Plain ints are used here, so intermediate
+    //  values are not truncated to 16 bits.
+    const int linear = (a_i * t) >> 16;   // a * t
+    int square = 2 * (a_i >> 1);
+    int cubic  = linear;                  // starts from the same a * t product
+    int step;
+
+    // square picks up t^2; cubic picks up the remaining t^2 to reach t^3.
+    for (step = 0; step < 2; step++)
+    {
+        square = (square * t) >> 16;
+        cubic  = (cubic * t) >> 16;
+    }
+
+    // Halve the linear and cubic terms, combine, then shift down to Q12.
+    return ((square - (linear >> 1) - (cubic >> 1)) >> 3);
+}
+#else
+//          3     2
+// C0 = -a*t + a*t
+//
+float C0(float t)
+{
+    // Float reference for the first tap weight; uses the file-level a.
+    return -a * t * t * t + a * t * t;
+}
+
+//                      2          3
+// C1 = -a*t + (2*a+3)*t  - (a+2)*t
+//
+float C1(float t)
+{
+    // Float reference for the second tap weight; uses the file-level a.
+    return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
+}
+
+//                 2          3
+// C2 = 1 - (a+3)*t  + (a+2)*t
+//
+float C2(float t)
+{
+    // Float reference for the third tap weight; evaluates to 1 at t == 0.
+    return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
+}
+
+//                 2      3
+// C3 = a*t - 2*a*t  + a*t
+//
+float C3(float t)
+{
+    // Float reference for the fourth tap weight; uses the file-level a.
+    return a * t * t * t - 2.0f * a * t * t + a * t;
+}
+#endif
+
+#if 0
+int compare_real_fixed()
+{
+    // Self-check (compiled out by the surrounding #if 0): walks 10000
+    //  evenly spaced phases, evaluates the four coefficients with both the
+    //  fixed-point and the float routines (float results scaled by 4096),
+    //  and returns how many checks failed.
+    int i, errors = 0;
+    float mult = 1.0 / 10000.0;
+    unsigned int fixed_mult = mult * 4294967296;//65536;
+    unsigned int phase_offset_int;
+    float phase_offset_real;
+
+    for (i = 0; i < 10000; i++)
+    {
+        int fixed0, fixed1, fixed2, fixed3, fixed_total;
+        int real0, real1, real2, real3, real_total;
+
+        phase_offset_real = (float)i * mult;
+        phase_offset_int = (fixed_mult * i) >> 16;
+//      phase_offset_int = phase_offset_real * 65536;
+
+        // Each coefficient may differ in magnitude by at most 1.
+        fixed0 = c0_fixed(phase_offset_int);
+        real0 = C0(phase_offset_real) * 4096.0;
+
+        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
+            errors++;
+
+        fixed1 = c1_fixed(phase_offset_int);
+        real1 = C1(phase_offset_real) * 4096.0;
+
+        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
+            errors++;
+
+        fixed2 = c2_fixed(phase_offset_int);
+        real2 = C2(phase_offset_real) * 4096.0;
+
+        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
+            errors++;
+
+        fixed3 = c3_fixed(phase_offset_int);
+        real3 = C3(phase_offset_real) * 4096.0;
+
+        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
+            errors++;
+
+        // The four taps together should sum to roughly 4096.
+        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
+        real_total = real0 + real1 + real2 + real3;
+
+        if ((fixed_total > 4097) || (fixed_total < 4094))
+            errors ++;
+
+        if ((real_total > 4097) || (real_total < 4095))
+            errors ++;
+    }
+
+    return errors;
+}
+#endif
+
+// Find the greatest common divisor of two integers.  The method used here is
+//  slow compared to Euclid's algorithm, but does not require any division.
+int gcd(int a, int b)
+{
+    // Repeated subtraction only terminates for strictly positive inputs.
+    //  For anything else return 1 rather than 0, so that a caller reducing
+    //  a ratio by the result never divides by zero.
+    if (a <= 0 || b <= 0)
+        return 1;
+
+    // Keep taking the smaller value away from the larger one until the two
+    //  meet; the value they meet at is the greatest common divisor.
+    while (a != b)
+    {
+        if (a > b)
+            a -= b;
+        else
+            b -= a;
+    }
+
+    return a;
+}
+
+void bicubic_coefficient_init()
+{
+    // Mark the scaler state as initialised and clear every field of it.
+    g_first_time = 0;
+    vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
+}
+
+void bicubic_coefficient_destroy()
+{
+    // Nothing has been set up (or allocated) before the first init.
+    if (g_first_time)
+        return;
+
+    // Release the line-index tables ...
+    if (g_b_scaler.l_w)
+        vpx_free(g_b_scaler.l_w);
+
+    if (g_b_scaler.l_h)
+        vpx_free(g_b_scaler.l_h);
+
+    if (g_b_scaler.l_h_uv)
+        vpx_free(g_b_scaler.l_h_uv);
+
+    // ... and the coefficient tables.
+    if (g_b_scaler.c_w)
+        vpx_free(g_b_scaler.c_w);
+
+    if (g_b_scaler.c_h)
+        vpx_free(g_b_scaler.c_h);
+
+    if (g_b_scaler.c_h_uv)
+        vpx_free(g_b_scaler.c_h_uv);
+
+    // Clear all fields, including the cached dimensions and the pointers
+    //  just freed.
+    vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
+}
+
+// Create the coefficients that will be used for the cubic interpolation.
+//  Because scaling does not have to be equal in the vertical and horizontal
+//  directions, the phase offsets will be different.  There are 4 coefficients
+//  for each point, two on each side.  The layout is that the 4 coefficients
+//  for one phase are stored together in the array, followed by the next phase.
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
+{
+    // Builds the per-phase filter taps (c_w, c_h, c_h_uv) and the
+    //  output-to-input index tables (l_w, l_h, l_h_uv) in g_b_scaler for
+    //  the given scaling ratio.
+    //  Returns 0 on success, or when the tables already match these
+    //  dimensions.  Returns -1 when an output dimension is not positive or
+    //  when a table cannot be allocated.
+    int i;
+#ifdef FIXED_POINT
+    int phase_offset_int;
+    unsigned int fixed_mult;
+    int product_val = 0;
+#else
+    float phase_offset;
+#endif
+    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;
+
+    if (g_first_time)
+        bicubic_coefficient_init();
+
+
+    // check to see if the coefficients have already been set up correctly
+    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
+        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
+        return 0;
+
+    g_b_scaler.in_width = in_width;
+    g_b_scaler.in_height = in_height;
+    g_b_scaler.out_width = out_width;
+    g_b_scaler.out_height = out_height;
+
+    // Don't want to allow crazy scaling, just try and prevent a catastrophic
+    //  failure here.  Want to fail after setting the member functions so
+    //  if the scaler is called the member functions will not scale.
+    if (out_width <= 0 || out_height <= 0)
+        return -1;
+
+    // reduce in/out width and height ratios using the gcd
+    gcd_w = gcd(out_width, in_width);
+    gcd_h = gcd(out_height, in_height);
+    gcd_h_uv = gcd(out_height, in_height / 2);
+
+    // the numerator width and height are to be saved in
+    //  globals so they can be used during the scaling process
+    //  without having to be recalculated.
+    g_b_scaler.nw = out_width / gcd_w;
+    d_w = in_width / gcd_w;
+
+    g_b_scaler.nh = out_height / gcd_h;
+    d_h = in_height / gcd_h;
+
+    g_b_scaler.nh_uv = out_height / gcd_h_uv;
+    d_h_uv = (in_height / 2) / gcd_h_uv;
+
+    // allocate memory for the index tables
+    if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);
+
+    if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);
+
+    if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);
+
+    // l_h and l_h_uv need out_height + 1 entries: the fill loop at the end
+    //  of this function writes indices 0..out_height inclusive.
+    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * sizeof(short));
+    g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
+    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
+
+    // allocate memory for the coefficients (4 taps per phase)
+    if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);
+
+    if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);
+
+    if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);
+
+    g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * sizeof(short));
+    g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * sizeof(short));
+    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * sizeof(short));
+
+    // If any table could not be allocated, release the others and clear the
+    //  cached dimensions (bicubic_coefficient_destroy zeroes g_b_scaler), so
+    //  that a later call with the same dimensions tries again rather than
+    //  returning 0 for tables that were never built.
+    if (!g_b_scaler.l_w || !g_b_scaler.l_h || !g_b_scaler.l_h_uv ||
+        !g_b_scaler.c_w || !g_b_scaler.c_h || !g_b_scaler.c_h_uv)
+    {
+        bicubic_coefficient_destroy();
+        return -1;
+    }
+
+    g_b_scaler.hbuf = g_hbuf;
+    g_b_scaler.hbuf_uv = g_hbuf_uv;
+
+    // Set up polyphase filter taps.  This needs to be done before
+    //  the scaling because of the floating point math required.  The
+    //  coefficients are multiplied by 2^12 so that fixed point math
+    //  can be used in the main scaling loop.
+#ifdef FIXED_POINT
+    // fixed_mult is 1/nw scaled by 2^32; multiplying it by the running
+    //  numerator and shifting right by 16 yields the phase in Q16.
+    fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;
+
+    product_val = 0;
+
+    for (i = 0; i < g_b_scaler.nw; i++)
+    {
+        if (product_val > g_b_scaler.nw)
+            product_val -= g_b_scaler.nw;
+
+        phase_offset_int = (fixed_mult * product_val) >> 16;
+
+        // Horizontal taps are stored in the reverse order (c3 first)
+        //  compared with the vertical tables below.
+        g_b_scaler.c_w[i*4]   = c3_fixed(phase_offset_int);
+        g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
+        g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
+        g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);
+
+        product_val += d_w;
+    }
+
+
+    fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;
+
+    product_val = 0;
+
+    for (i = 0; i < g_b_scaler.nh; i++)
+    {
+        if (product_val > g_b_scaler.nh)
+            product_val -= g_b_scaler.nh;
+
+        phase_offset_int = (fixed_mult * product_val) >> 16;
+
+        g_b_scaler.c_h[i*4]   = c0_fixed(phase_offset_int);
+        g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
+        g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
+        g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);
+
+        product_val += d_h;
+    }
+
+    fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;
+
+    product_val = 0;
+
+    for (i = 0; i < g_b_scaler.nh_uv; i++)
+    {
+        if (product_val > g_b_scaler.nh_uv)
+            product_val -= g_b_scaler.nh_uv;
+
+        phase_offset_int = (fixed_mult * product_val) >> 16;
+
+        g_b_scaler.c_h_uv[i*4]   = c0_fixed(phase_offset_int);
+        g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
+        g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
+        g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);
+
+        product_val += d_h_uv;
+    }
+
+#else
+
+    // Float path: same tables, built from the float coefficient routines.
+    for (i = 0; i < g_b_scaler.nw; i++)
+    {
+        phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
+        g_b_scaler.c_w[i*4]   = (C3(phase_offset) * 4096.0);
+        g_b_scaler.c_w[i*4+1] = (C2(phase_offset) * 4096.0);
+        g_b_scaler.c_w[i*4+2] = (C1(phase_offset) * 4096.0);
+        g_b_scaler.c_w[i*4+3] = (C0(phase_offset) * 4096.0);
+    }
+
+    for (i = 0; i < g_b_scaler.nh; i++)
+    {
+        phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
+        g_b_scaler.c_h[i*4]   = (C0(phase_offset) * 4096.0);
+        g_b_scaler.c_h[i*4+1] = (C1(phase_offset) * 4096.0);
+        g_b_scaler.c_h[i*4+2] = (C2(phase_offset) * 4096.0);
+        g_b_scaler.c_h[i*4+3] = (C3(phase_offset) * 4096.0);
+    }
+
+    for (i = 0; i < g_b_scaler.nh_uv; i++)
+    {
+        phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
+        g_b_scaler.c_h_uv[i*4]   = (C0(phase_offset) * 4096.0);
+        g_b_scaler.c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0);
+        g_b_scaler.c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0);
+        g_b_scaler.c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0);
+    }
+
+#endif
+
+    // Create an array that corresponds input lines to output lines.
+    //  This doesn't require floating point math, but it does require
+    //  a division and because hardware division is not present that
+    //  is a call.
+    for (i = 0; i < out_width; i++)
+    {
+        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;
+
+        // Track the last output column whose source column still has at
+        //  least two input columns available from it.
+        if ((g_b_scaler.l_w[i] + 2) <= in_width)
+            g_b_scaler.max_usable_out_width = i;
+
+    }
+
+    // One entry more than the number of output lines is filled in.
+    for (i = 0; i < out_height + 1; i++)
+    {
+        g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
+        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
+    }
+
+    return 0;
+}
+
+// Scales one 8-bit plane using the tables in g_b_scaler: each output row is
+//  built from a vertically filtered (or directly reused) input row, which is
+//  then filtered horizontally into output_image.  Always returns 0.
+int bicubic_scale(int in_width, int in_height, int in_stride,
+                  int out_width, int out_height, int out_stride,
+                  unsigned char *input_image, unsigned char *output_image)
+{
+    short *RESTRICT l_w, * RESTRICT l_h;
+    short *RESTRICT c_w, * RESTRICT c_h;
+    unsigned char *RESTRICT ip, * RESTRICT op;
+    unsigned char *RESTRICT hbuf;
+    int h, w, lw, lh;
+    int temp_sum;
+    int phase_offset_w, phase_offset_h;
+
+    c_w = g_b_scaler.c_w;
+    c_h = g_b_scaler.c_h;
+    l_w = g_b_scaler.l_w;
+    l_h = g_b_scaler.l_h;
+
+    op = output_image;
+    phase_offset_h = 0;
+
+    for (h = 0; h < out_height; h++)
+    {
+        // select the row to work on
+        lh = l_h[h];
+        ip = input_image + (in_stride * lh);
+
+        // Filter the row vertically into a temporary buffer.
+        //  If the phase offset == 0 then all the multiplication
+        //  is going to result in the output equalling the input.
+        //  So instead point the temporary buffer to the input.
+        //  Also handle the boundary condition of not being able to
+        //  filter the last lines.
+        // NOTE(review): ip[w - in_stride] is above the plane when lh == 0.
+        if (phase_offset_h && (lh < in_height - 2))
+        {
+            hbuf = g_b_scaler.hbuf;
+            for (w = 0; w < in_width; w++)
+            {
+                temp_sum =  c_h[phase_offset_h*4+3] * ip[w - in_stride];
+                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
+                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
+                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];
+                hbuf[w] = temp_sum >> 12;
+            }
+        }
+        else
+            hbuf = ip;
+
+        // increase the phase offset for the next time around.
+        if (++phase_offset_h >= g_b_scaler.nh)
+            phase_offset_h = 0;
+
+        // now filter and expand it horizontally into the final
+        //  output buffer
+        phase_offset_w = 0;
+
+        for (w = 0; w < out_width; w++)
+        {
+            // get the index to use to expand the image
+            lw = l_w[w];
+
+            // Boundary conditions: at the left edge and in the last two
+            //  columns copy the pixel.  Testing before filtering keeps
+            //  hbuf[lw - 1] and hbuf[lw + 2] inside the row.
+            if ((lw == 0) || ((lw + 2) >= in_width))
+                temp_sum = hbuf[lw];
+            else
+            {
+                temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
+                temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
+                temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
+                temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
+                temp_sum = temp_sum >> 12;
+            }
+
+            if (++phase_offset_w >= g_b_scaler.nw)
+                phase_offset_w = 0;
+            op[w] = temp_sum;
+        }
+
+        op += out_stride;
+    }
+
+    return 0;
+}
+
+void bicubic_scale_frame_reset()
+{
+    // Zero both output dimensions recorded in g_b_scaler.
+    g_b_scaler.out_height = g_b_scaler.out_width = 0;
+}
+
+void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                         int new_width, int new_height)
+{
+    // The U and V planes get half the requested width and height.
+    const int chroma_w = new_width / 2, chroma_h = new_height / 2;
+
+    dst->y_width   = new_width;
+    dst->y_height  = new_height;
+    dst->uv_width  = chroma_w;
+    dst->uv_height = chroma_h;
+    dst->y_stride  = dst->y_width;      // strides are set to the widths
+    dst->uv_stride = dst->uv_width;
+
+    // Scale the Y plane first, then U and V with the chroma geometry.
+    bicubic_scale(src->y_width, src->y_height, src->y_stride,
+                  new_width, new_height, dst->y_stride,
+                  src->y_buffer, dst->y_buffer);
+    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
+                  chroma_w, chroma_h, dst->uv_stride,
+                  src->u_buffer, dst->u_buffer);
+    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
+                  chroma_w, chroma_h, dst->uv_stride,
+                  src->v_buffer, dst->v_buffer);
+}
diff --git a/vpx_scale/generic/gen_scalers.c b/vpx_scale/generic/gen_scalers.c
new file mode 100644
index 000000000..a5e545f70
--- /dev/null
+++ b/vpx_scale/generic/gen_scalers.c
@@ -0,0 +1,954 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+/****************************************************************************
+*  Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_4_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+// Expands a line 4:5 by linear interpolation (weights are n/256, rounded).
+//  source_width is the number of source pixels; dest_width is unused.
+void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source,
+                                       unsigned int source_width,
+                                       unsigned char *dest,
+                                       unsigned int dest_width)
+{
+    unsigned i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // Written as "i + 4 < source_width": "source_width - 4" wraps around
+    //  for widths below 4 and the loop would then run far past the line.
+    for (i = 0; i + 4 < source_width; i += 4)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char) a;
+        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+        c = src[2] * 154;
+        a = src[3];
+        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+        b = src[4];        // first pixel of the next group
+        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);
+        src += 4;
+        des += 5;
+    }
+
+    // Last group: no right-hand neighbour, so des[4] repeats the pixel.
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+    c = src[2] * 154;
+    a = src[3];
+    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+    des [4] = (unsigned char)(a);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_4_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
+ *                  height of the band scaled is 4-pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row offsets within the band; row 5 is the first line of the next band.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3;
+    const unsigned int r4 = dest_pitch * 4, r5 = dest_pitch * 5;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *p = dest + col;
+        unsigned int top, mid, w154, low;
+
+        // Rows 1-3 are rewritten in place, each from values read beforehand.
+        top = p[0];
+        mid = p[dest_pitch];
+        p[dest_pitch] = (unsigned char)((top * 51 + 205 * mid + 128) >> 8);
+        w154 = p[r2] * 154;
+        low = p[r3];
+        p[r2] = (unsigned char)((mid * 102 + w154 + 128) >> 8);
+        p[r3] = (unsigned char)((w154 + 102 * low + 128) >> 8);
+
+        // Row 4 blends the old row 3 with the next band's first line.
+        top = p[r5];
+        p[r4] = (unsigned char)((low * 205 + 51 * top + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_4_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 4 to 5. The
+ *                  height of the band scaled is 4-pixels.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row offsets within the band; there is no following band to read.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3;
+    const unsigned int r4 = dest_pitch * 4;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *p = dest + col;
+        unsigned int top, mid, w154, low;
+
+        // Rows 1-3 are rewritten in place, each from values read beforehand.
+        top = p[0];
+        mid = p[dest_pitch];
+        p[dest_pitch] = (unsigned char)((top * 51 + 205 * mid + 128) >> 8);
+        w154 = p[r2] * 154;
+        low = p[r3];
+        p[r2] = (unsigned char)((mid * 102 + w154 + 128) >> 8);
+        p[r3] = (unsigned char)((w154 + 102 * low + 128) >> 8);
+
+        // No line below to interpolate with, so row 4 copies the old row 3.
+        p[r4] = (unsigned char) low;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_2_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 2 to 3.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+// Expands a line 2:3 by linear interpolation (weights are n/256, rounded).
+//  source_width is the number of source pixels; dest_width is unused.
+void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source,
+                                       unsigned int source_width,
+                                       unsigned char *dest,
+                                       unsigned int dest_width)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // Written as "i + 2 < source_width": "source_width - 2" wraps around
+    //  for widths below 2 and the loop would then run far past the line.
+    for (i = 0; i + 2 < source_width; i += 2)
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];        // first pixel of the next pair
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 85 + 171 * b + 128) >> 8);
+        des [2] = (unsigned char)((b * 171 + 85 * c + 128) >> 8);
+        src += 2;
+        des += 3;
+    }
+
+    // Last pair: no right-hand neighbour, so des[2] repeats the pixel.
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 85 + 171 * b + 128) >> 8);
+    des [2] = (unsigned char)(b);
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_2_3_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
+ *                  height of the band scaled is 2-pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row 3 is the first line of the band below and is only read.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *p = dest + col;
+        const unsigned int top = p[0];
+        const unsigned int mid = p[dest_pitch];
+        const unsigned int next = p[r3];
+
+        p[dest_pitch] = (unsigned char)((top * 85 + 171 * mid + 128) >> 8);
+        p[r2] = (unsigned char)((mid * 171 + 85 * next + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_2_3_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 2 to 3. The
+ *                  height of the band scaled is 2-pixels.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *p = dest + col;
+        const unsigned int top = p[0];
+        const unsigned int mid = p[dest_pitch];
+
+        p[dest_pitch] = (unsigned char)((top * 85 + 171 * mid + 128) >> 8);
+
+        // No band below to interpolate with: row 2 copies the old row 1.
+        p[dest_pitch * 2] = (unsigned char) mid;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 5.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+// Expands a line 3:5 by linear interpolation (weights are n/256, rounded).
+//  source_width is the number of source pixels; dest_width is unused.
+void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source,
+                                       unsigned int source_width,
+                                       unsigned char *dest,
+                                       unsigned int dest_width)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // Written as "i + 3 < source_width": "source_width - 3" wraps around
+    //  for widths below 3 and the loop would then run far past the line.
+    for (i = 0; i + 3 < source_width; i += 3)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+
+        c = src[2] ;
+        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+        a = src[3];        // first pixel of the next group
+        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
+
+        src += 3;
+        des += 5;
+    }
+
+    // Last group: no right-hand neighbour, so des[4] repeats the pixel.
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+    c = src[2] ;
+    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+    des [4] = (unsigned char)(c);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
+ *                  height of the band scaled is 3-pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row offsets within the band; row 5 is the first line of the next band.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3;
+    const unsigned int r4 = dest_pitch * 4, r5 = dest_pitch * 5;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *p = dest + col;
+        unsigned int top, mid, low;
+
+        top = p[0];
+        mid = p[dest_pitch];
+        p[dest_pitch] = (unsigned char)((top * 102 + 154 * mid + 128) >> 8);
+        low = p[r2];
+        p[r2] = (unsigned char)((mid * 205 + low * 51 + 128) >> 8);
+        p[r3] = (unsigned char)((mid * 51 + low * 205 + 128) >> 8);
+        // Row 4 blends the old row 2 with the next band's first line.
+        top = p[r5];
+        p[r4] = (unsigned char)((low * 154 + top * 102 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_3_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 5. The
+ *                  height of the band scaled is 3-pixels.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row offsets within the band; there is no following band to read.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3;
+    const unsigned int r4 = dest_pitch * 4;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *p = dest + col;
+        unsigned int top, mid, low;
+
+        top = p[0];
+        mid = p[dest_pitch];
+        p[dest_pitch] = (unsigned char)((top * 102 + 154 * mid + 128) >> 8);
+        low = p[r2];
+        p[r2] = (unsigned char)((mid * 205 + low * 51 + 128) >> 8);
+        p[r3] = (unsigned char)((mid * 51 + low * 205 + 128) >> 8);
+
+        // No line below to interpolate with, so row 4 copies the old row 2.
+        p[r4] = (unsigned char)(low);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_3_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 4.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+// Expands a line 3:4 by linear interpolation (weights are n/256, rounded).
+//  source_width is the number of source pixels; dest_width is unused.
+void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source,
+                                       unsigned int source_width,
+                                       unsigned char *dest,
+                                       unsigned int dest_width)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    // Written as "i + 3 < source_width": "source_width - 3" wraps around
+    //  for widths below 3 and the loop would then run far past the line.
+    for (i = 0; i + 3 < source_width; i += 3)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);
+
+        c = src[2];
+        des [2] = (unsigned char)((b + c + 1) >> 1);
+        a = src[3];        // first pixel of the next group
+        des [3] = (unsigned char)((c * 192 + a * 64 + 128) >> 8);
+
+        src += 3;
+        des += 4;
+    }
+
+    // Last group: no right-hand neighbour, so des[3] repeats the pixel.
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);
+    c = src[2] ;
+    des [2] = (unsigned char)((b + c + 1) >> 1);
+    des [3] = (unsigned char)(c);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_3_4_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
+ *                  height of the band scaled is 3-pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row offsets within the band; row 4 is the first line of the next band.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3, r4 = dest_pitch * 4;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *p = dest + col;
+        unsigned int top, mid, low;
+
+        top = p[0];
+        mid = p[dest_pitch];
+        p[dest_pitch] = (unsigned char)((top * 64 + mid * 192 + 128) >> 8);
+        low = p[r2];
+        p[r2] = (unsigned char)((mid + low + 1) >> 1);
+
+        // Row 3 blends the old row 2 with the next band's first line.
+        top = p[r4];
+        p[r3] = (unsigned char)((low * 192 + top * 64 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_3_4_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 4. The
+ *                  height of the band scaled is 3-pixels.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row offsets within the band; there is no following band to read.
+    const unsigned int r2 = dest_pitch * 2, r3 = dest_pitch * 3;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *p = dest + col;
+        unsigned int top, mid, low;
+
+        top = p[0];
+        mid = p[dest_pitch];
+        p[dest_pitch] = (unsigned char)((top * 64 + mid * 192 + 128) >> 8);
+
+        low = p[r2];
+        p[r2] = (unsigned char)((mid + low + 1) >> 1);
+
+        // No line below to interpolate with, so row 3 copies the old row 2.
+        p[r3] = (unsigned char)(low);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 1 to 2.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+// Expands a line 1:2; source_width is the number of source pixels.
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source,
+        unsigned int source_width, unsigned char *dest,
+        unsigned int dest_width)
+{
+    unsigned int i, a, b;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+    // An empty line would make "source_width - 1" below wrap around.
+    if (source_width == 0)
+        return;
+
+    for (i = 0; i < source_width - 1; i += 1)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a + b + 1) >> 1);   // rounded average
+        src += 1;
+        des += 2;
+    }
+
+    // Last pixel: no right-hand neighbour, so it is written twice.
+    a = src[0];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)(a);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+ *                  height of the band scaled is 1-pixel.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Row 2 is the first line of the band below and is only read.
+    const unsigned int r2 = dest_pitch * 2;
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        unsigned char *p = dest + col;
+        const unsigned int above = p[0];
+        const unsigned int below = p[r2];
+
+        // Row 1 becomes the rounded average of rows 0 and 2.
+        p[dest_pitch] = (unsigned char)((above + below + 1) >> 1);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_1_2_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 1 to 2. The
+ *                  height of the band scaled is 1-pixel.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned char *top = dest;
+    unsigned char *out = dest + dest_pitch;
+    unsigned int left;
+
+    // No band below to average with: row 1 is a plain copy of row 0,
+    //  one pixel at a time from left to right.
+    for (left = dest_width; left != 0; --left)
+        *out++ = *top++;
+}
+
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_5_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 4.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+// Shrinks a line 5:4: each group of five source pixels produces four output
+//  pixels (weights are n/256, rounded).  dest_width is unused.
+void vp8cx_horizontal_line_5_4_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    const unsigned char *in = source;
+    unsigned char *out = dest;
+    unsigned int done;
+
+    (void) dest_width;
+
+    // NOTE(review): every pass reads in[0..4], so source_width is presumably
+    //  a multiple of five - confirm against the callers.
+    for (done = 0; done < source_width; done += 5, in += 5, out += 4)
+    {
+        const unsigned int p1 = in[1];
+        const unsigned int p2 = in[2];
+        const unsigned int p3 = in[3];
+        const unsigned int p4 = in[4];
+
+        // The first pixel is copied; the other three are two-tap blends.
+        out[0] = in[0];
+        out[1] = (unsigned char)((p1 * 192 + p2 * 64 + 128) >> 8);
+        out[2] = (unsigned char)((p2 * 128 + p3 * 128 + 128) >> 8);
+        out[3] = (unsigned char)((p3 * 64 + p4 * 192 + 128) >> 8);
+    }
+}
+
+
+
+
+// Shrinks a band 5:4 vertically: for each of dest_width columns, five source
+//  rows (src_pitch apart) produce four destination rows (dest_pitch apart),
+//  using n/256 weights with rounding.
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        const unsigned char *in = source + col;
+        unsigned char *out = dest + col;
+
+        // All five inputs are read before anything is stored.
+        const unsigned int r0 = in[0 * src_pitch];
+        const unsigned int r1 = in[1 * src_pitch];
+        const unsigned int r2 = in[2 * src_pitch];
+        const unsigned int r3 = in[3 * src_pitch];
+        const unsigned int r4 = in[4 * src_pitch];
+
+        // The first row is copied; the other three are two-tap blends.
+        out[0 * dest_pitch] = (unsigned char) r0;
+        out[1 * dest_pitch] = (unsigned char)((r1 * 192 + r2 * 64 + 128) >> 8);
+        out[2 * dest_pitch] = (unsigned char)((r2 * 128 + r3 * 128 + 128) >> 8);
+        out[3 * dest_pitch] = (unsigned char)((r3 * 64 + r4 * 192 + 128) >> 8);
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_5_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 3.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+// Shrinks a line 5:3: each group of five source pixels produces three output
+//  pixels (weights are n/256, rounded).  dest_width is unused.
+void vp8cx_horizontal_line_5_3_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    const unsigned char *in = source;
+    unsigned char *out = dest;
+    unsigned int done;
+
+    (void) dest_width;
+
+    // NOTE(review): every pass reads in[0..4], so source_width is presumably
+    //  a multiple of five - confirm against the callers.
+    for (done = 0; done < source_width; done += 5, in += 5, out += 3)
+    {
+        const unsigned int p1 = in[1];
+        const unsigned int p2 = in[2];
+        const unsigned int p3 = in[3];
+        const unsigned int p4 = in[4];
+
+        // The first pixel is copied; the second output blends pixels 1
+        //  and 2, the third blends pixels 3 and 4.
+        out[0] = in[0];
+        out[1] = (unsigned char)((p1 * 85  + p2 * 171 + 128) >> 8);
+        out[2] = (unsigned char)((p3 * 171 + p4 * 85 + 128) >> 8);
+    }
+}
+
+// Shrinks a band 5:3 vertically: for each of dest_width columns, five source
+//  rows (src_pitch apart) produce three destination rows (dest_pitch apart),
+//  using n/256 weights with rounding.
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+
+    for (col = 0; col < dest_width; col++)
+    {
+        const unsigned char *in = source + col;
+        unsigned char *out = dest + col;
+
+        // All five inputs are read before anything is stored.
+        const unsigned int r0 = in[0 * src_pitch];
+        const unsigned int r1 = in[1 * src_pitch];
+        const unsigned int r2 = in[2 * src_pitch];
+        const unsigned int r3 = in[3 * src_pitch];
+        const unsigned int r4 = in[4 * src_pitch];
+
+        // The first row is copied; the other two are two-tap blends.
+        out[0 * dest_pitch] = (unsigned char) r0;
+        out[1 * dest_pitch] = (unsigned char)((r1 * 85 + r2 * 171 + 128) >> 8);
+        out[2 * dest_pitch] = (unsigned char)((r3 * 171 + r4 * 85 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_2_1_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Width of source line in pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 2 to 1.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+// Shrinks a line 2:1 by keeping every other source pixel, starting with the
+//  first.  source_width is the number of source pixels; dest_width is
+//  unused.
+void vp8cx_horizontal_line_2_1_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    const unsigned char *in = source;
+    unsigned char *out = dest;
+    unsigned int done;
+
+    (void) dest_width;
+
+    // Source pixels 0, 2, 4, ... are copied unchanged; the odd ones are
+    //  skipped and no filtering is applied.
+    for (done = 0; done < source_width; done += 2)
+    {
+        *out = *in;
+        in += 2;
+        out += 1;
+    }
+}
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    (void) dest_pitch;      // neither pitch is used: only one row is touched
+    (void) src_pitch;
+    vpx_memcpy(dest, source, dest_width);   // presumably a memcpy wrapper from vpx_mem.h - confirm
+}
+
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // "i - (int)src_pitch" on an unsigned i wraps to a huge index rather than
+    //  a negative one, so the neighbouring rows are addressed via pointers.
+    const unsigned char *above = source - src_pitch;
+    const unsigned char *below = source + src_pitch;
+    unsigned int i;
+    int temp;
+
+    (void) dest_pitch;
+
+    for (i = 0; i < dest_width; i++)
+    {
+        // 3-10-3 vertical filter over the three rows, rounded, divided by 16
+        temp = 8 + above[i] * 3 + source[i] * 10 + below[i] * 3;
+        dest[i] = (unsigned char)(temp >> 4);
+    }
+}
diff --git a/vpx_scale/generic/scalesystemdependant.c b/vpx_scale/generic/scalesystemdependant.c
new file mode 100644
index 000000000..28f5c7252
--- /dev/null
+++ b/vpx_scale/generic/scalesystemdependant.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+#ifdef HAVE_CONFIG_H
+#include "vpx_config.h"
+#endif
+
+void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf);   /* hook; assigned in vp8_scale_machine_specific_config() */
+extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+/* Hook for vp8_yv12_copy_frame_yonly; assigned in vp8_scale_machine_specific_config(). */
+void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+/* Hook for vp8_yv12_copy_frame; assigned in vp8_scale_machine_specific_config(). */
+void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Generic configuration: installs the portable C scaling
+ *                  routines and YV12 helpers into the global function pointers.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config()
+{
+    // Frame helpers are installed regardless of the resampling configuration.
+    vp8_yv12_copy_frame_ptr               = vp8_yv12_copy_frame;
+    vp8_yv12_copy_frame_yonly_ptr         = vp8_yv12_copy_frame_yonly;
+    vp8_yv12_extend_frame_borders_ptr     = vp8_yv12_extend_frame_borders;
+#if CONFIG_SPATIAL_RESAMPLING
+    // Up-scaling: horizontal line kernels.
+    vp8_horizontal_line_1_2_scale         = vp8cx_horizontal_line_1_2_scale_c;
+    vp8_horizontal_line_2_3_scale         = vp8cx_horizontal_line_2_3_scale_c;
+    vp8_horizontal_line_3_4_scale         = vp8cx_horizontal_line_3_4_scale_c;
+    vp8_horizontal_line_3_5_scale         = vp8cx_horizontal_line_3_5_scale_c;
+    vp8_horizontal_line_4_5_scale         = vp8cx_horizontal_line_4_5_scale_c;
+    // Up-scaling: vertical band kernels, each paired with its last-band variant.
+    vp8_vertical_band_1_2_scale           = vp8cx_vertical_band_1_2_scale_c;
+    vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
+    vp8_vertical_band_2_3_scale           = vp8cx_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
+    vp8_vertical_band_3_4_scale           = vp8cx_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_vertical_band_3_5_scale           = vp8cx_vertical_band_3_5_scale_c;
+    vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
+    vp8_vertical_band_4_5_scale           = vp8cx_vertical_band_4_5_scale_c;
+    vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
+    // Down-scaling kernels (5:4, 5:3 and 2:1).
+    vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
+    vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
+    vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
+    vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
+    vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
+    vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
+    vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
+#endif
+}
diff --git a/vpx_scale/generic/vpxscale.c b/vpx_scale/generic/vpxscale.c
new file mode 100644
index 000000000..206cd5512
--- /dev/null
+++ b/vpx_scale/generic/vpxscale.c
@@ -0,0 +1,1088 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     scale.c
+ *
+ *   Description  :     Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_scale/scale_mode.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+#ifndef VPX_NO_GLOBALS   /* when defined, vpxscale_nofp.h is included instead of these pointer definitions */
+void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;   /* up-scaling kernels; every pointer starts out null */
+void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_3_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_2_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+/* Down-scaling kernels (5:4, 5:3, 2:1); the vertical ones take separate source and destination pointers. */
+void (*vp8_vertical_band_5_4_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_5_3_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_2_1_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
+#else
+# include "vpxscale_nofp.h"
+#endif
+
+typedef struct
+{
+    int     expanded_frame_width;   // NOTE(review): not referenced in the code visible here -- meaning unconfirmed
+    int     expanded_frame_height;  // NOTE(review): as above
+    /* Suggested scale factors; any_ratio_2d_scale() reads these four fields. */
+    int HScale;
+    int HRatio;
+    int VScale;
+    int VRatio;
+    /* NOTE(review): presumably the frames being scaled from / to -- confirm against callers. */
+    YV12_BUFFER_CONFIG *src_yuv_config;
+    YV12_BUFFER_CONFIG *dst_yuv_config;
+
+} SCALE_VARS;
+
+/****************************************************************************
+ *
+ *  ROUTINE       :     horizontal_line_copy
+ *
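+ *  INPUTS        :     source, source_width : line to copy and its length in bytes.
+ *                      dest                 : destination line.
+ *                      dest_width           : not used.
+ *  OUTPUTS       :     None.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     1 to 1 "scaling" of a horizontal line of pixels (plain copy)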
+ *
+ *  SPECIAL NOTES :     None.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+static void horizontal_line_copy(const unsigned char *source,
+                                 unsigned int source_width,
+                                 unsigned char *dest,
+                                 unsigned int dest_width)
+{
+    /*
+     * Identity horizontal scaler: the output line is the input line.  The
+     * length copied is taken from source_width; dest_width is ignored.
+     */
+    duck_memcpy(dest, source, source_width);
+    (void) dest_width;
+}
+/****************************************************************************
+ *
+ *  ROUTINE       :     null_scale
+ *
+ *  INPUTS        :     dest, dest_pitch, dest_width : all ignored.
+ *
+ *
+ *  OUTPUTS       :     None.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     1 to 1 "scaling" of a vertical band (deliberate no-op)
+ *
+ *  SPECIAL NOTES :     None.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+static void null_scale(unsigned char *dest,
+                       unsigned int dest_pitch,
+                       unsigned int dest_width)
+{
+    /*
+     * Vertical "scaler" for the 1:1 case.  Nothing is read or written; the
+     * casts below only mark the parameters as deliberately unused.  The
+     * signature matches the in-place vertical band kernels.
+     */
+    (void) dest_width;
+    (void) dest_pitch;
+    (void) dest;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_i
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 interpolated scaling.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_2t1_i(const unsigned char *source,
+                          int source_step,
+                          unsigned int source_scale,
+                          unsigned int source_length,
+                          unsigned char *dest,
+                          int dest_step,
+                          unsigned int dest_scale,
+                          unsigned int dest_length)
+{
+    /*
+     * 2:1 reduction with a 3-10-3 (/16) kernel.  Output sample n (n >= 1) is
+     * centred on input sample 2n and also reads its two neighbours one
+     * source_step away; output sample 0 is a straight copy of input sample 0,
+     * so nothing before the start of the source is read.
+     */
+    const int tap_offset = source_step;         // distance to a neighbouring tap
+    const unsigned int dest_end = dest_length * dest_step;
+    unsigned int out_idx = dest_step;
+    unsigned int in_idx, sum;
+
+    (void) source_scale;
+    (void) source_length;
+    (void) dest_scale;
+
+    source_step *= 2;                           // centre taps are two samples apart
+    dest[0] = source[0];
+
+    for (in_idx = source_step; out_idx < dest_end; out_idx += dest_step, in_idx += source_step)
+    {
+        sum = 8 + 3 * source[in_idx - tap_offset] + 10 * source[in_idx] + 3 * source[in_idx + tap_offset];
+        dest[out_idx] = (char)(sum >> 4);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_ps
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 point subsampled scaling.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_2t1_ps(const unsigned char *source,
+                           int source_step,
+                           unsigned int source_scale,
+                           unsigned int source_length,
+                           unsigned char *dest,
+                           int dest_step,
+                           unsigned int dest_scale,
+                           unsigned int dest_length)
+{
+    /*
+     * 2:1 reduction by point sampling: every second input sample is copied
+     * to the output unchanged; the skipped samples do not contribute.
+     */
+    const unsigned int dest_end = dest_length * dest_step;
+    unsigned int out_idx = 0;
+    unsigned int in_idx = 0;
+
+    (void) dest_scale;
+    (void) source_scale;
+    (void) source_length;
+
+    // The source cursor advances two samples for every output sample.
+    for (; out_idx < dest_end; out_idx += dest_step, in_idx += 2 * source_step)
+        dest[out_idx] = source[in_idx];
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source.
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination.
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs linear interpolation in one dimension.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_c(const unsigned char *source,
+                      int source_step,
+                      unsigned int source_scale,
+                      unsigned int source_length,
+                      unsigned char *dest,
+                      int dest_step,
+                      unsigned int dest_scale,
+                      unsigned int dest_length)
+{
+    /*
+     * Two-tap linear interpolation along one dimension.  `frac` is the
+     * position of the current output sample between the two source samples
+     * `lo` and `hi`, in units of 1/dest_scale; it advances by source_scale
+     * per output sample, and the source window slides forward each time it
+     * passes a whole sample.
+     *
+     * NOTE: `hi` is always fetched one step ahead of the window.  If there
+     * are boundary issues, the conditions to assert are:
+     *   dest_scale > source_scale
+     *   (source_length-1) * dest_scale >= (dest_length-1) * source_scale
+     */
+    const unsigned int half = dest_scale / 2;       // rounding term
+    const unsigned int dest_end = dest_length * dest_step;
+    unsigned int frac = 0;
+    unsigned int out_idx;
+    unsigned char lo = source[0];
+    unsigned char hi = source[source_step];
+
+    (void) source_length;
+
+    for (out_idx = 0; out_idx < dest_end; out_idx += dest_step)
+    {
+        const unsigned int lo_weight = dest_scale - frac;
+        dest[out_idx] = (char)((lo_weight * lo + frac * hi + half) / dest_scale);
+        // Step the sampling position, sliding the window past consumed samples.
+        for (frac += source_scale; frac > dest_scale; frac -= dest_scale)
+        {
+            source += source_step;
+            lo = source[0];
+            hi = source[source_step];
+        }
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : Scale2D
+ *
+ *  INPUTS        : const unsigned char *source  : Pointer to data to be scaled.
+ *                  int source_pitch              : Stride of source image.
+ *                  unsigned int source_width     : Width of input image.
+ *                  unsigned int source_height    : Height of input image.
+ *                  unsigned char *dest          : Pointer to output data array.
+ *                  int dest_pitch                : Stride of destination image.
+ *                  unsigned int dest_width       : Width of destination image.
+ *                  unsigned int dest_height      : Height of destination image.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal scale factor numerator.
+ *                  unsigned int hratio          : Horizontal scale factor denominator.
+ *                  unsigned int vscale          : Vertical scale factor numerator.
+ *                  unsigned int vratio          : Vertical scale factor denominator.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+static
+void Scale2D
+(
+    //const
+    unsigned char *source,
+    int source_pitch,
+    unsigned int source_width,
+    unsigned int source_height,
+    unsigned char *dest,
+    int dest_pitch,
+    unsigned int dest_width,
+    unsigned int dest_height,
+    unsigned char *temp_area,
+    unsigned char temp_area_height,
+    unsigned int hscale,
+    unsigned int hratio,
+    unsigned int vscale,
+    unsigned int vratio,
+    unsigned int interlaced
+)
+{
+    //unsigned
+    int i, j, k;
+    int bands;
+    int dest_band_height;
+    int source_band_height;
+    // Signature shared by the generic 1-D scalers (used when no fixed-ratio kernel applies).
+    typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
+                            unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
+
+    Scale1D Scale1Dv = scale1d_c;
+    Scale1D Scale1Dh = scale1d_c;
+    // Fixed-ratio kernels; each stays NULL unless the matching ratio switch below selects one.
+    void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+    void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, unsigned int, unsigned int) = NULL;
+
+    int ratio_scalable = 1;     // cleared when either direction has no fixed-ratio kernel
+    int interpolation = 0;      // set for progressive 2:1 vertical scaling (kernel reads the row above)
+
+    unsigned char *source_base; // = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch)));
+    unsigned char *line_src;
+
+    // Lowest address of the source image; differs from `source` only when the pitch is negative.
+    source_base = (unsigned char *)source;
+
+    if (source_pitch < 0)
+    {
+        int offset;
+
+        offset = (source_height - 1);
+        offset *= source_pitch;
+
+        source_base += offset;
+    }
+
+    // find out the ratio for each direction
+    switch (hratio * 10 / hscale)
+    {
+    case 8:
+        // 5-to-4 reduction in width (hratio/hscale = 4/5)
+        horiz_line_scale = vp8_horizontal_line_5_4_scale;
+        break;
+    case 6:
+        // 5-to-3 reduction in width (hratio/hscale = 3/5)
+        horiz_line_scale = vp8_horizontal_line_5_3_scale;
+        break;
+    case 5:
+        // 2-to-1 reduction in width (hratio/hscale = 1/2)
+        horiz_line_scale = vp8_horizontal_line_2_1_scale;
+        break;
+    default:
+        // The ratio is not acceptable now
+        // throw("The ratio is not acceptable for now!");
+        ratio_scalable = 0;
+        break;
+    }
+
+    switch (vratio * 10 / vscale)
+    {
+    case 8:
+        // 5-to-4 reduction in vertical direction
+        vert_band_scale     = vp8_vertical_band_5_4_scale;
+        source_band_height  = 5;
+        dest_band_height    = 4;
+        break;
+    case 6:
+        // 5-to-3 reduction in vertical direction
+        vert_band_scale     = vp8_vertical_band_5_3_scale;
+        source_band_height  = 5;
+        dest_band_height    = 3;
+        break;
+    case 5:
+        // 2-to-1 reduction in vertical direction
+
+        if (interlaced)
+        {
+            //if the content is interlaced, point sampling is used
+            vert_band_scale     = vp8_vertical_band_2_1_scale;
+        }
+        else
+        {
+
+            interpolation = 1;
+            //if the content is progressive, the interpolating 2:1 kernel is used
+            vert_band_scale     = vp8_vertical_band_2_1_scale_i;
+
+        }
+
+        source_band_height  = 2;
+        dest_band_height    = 1;
+        break;
+    default:
+        // The ratio is not acceptable now
+        // throw("The ratio is not acceptable for now!");
+        ratio_scalable = 0;
+        break;
+    }
+
+    if (ratio_scalable)
+    {
+        if (source_height == dest_height)
+        {
+            // heights match: only horizontal scaling is needed, one line at a time
+            for (k = 0; k < (int)dest_height; k++)
+            {
+                horiz_line_scale(source, source_width, dest, dest_width);
+                source += source_pitch;
+                dest   += dest_pitch;
+            }
+
+            return;
+        }
+        // Prime temp_area row 0 so the interpolating kernel has a row above the first band.
+        if (interpolation)
+        {
+            if (source < source_base)
+                source = source_base;
+
+            horiz_line_scale(source, source_width, temp_area, dest_width);
+        }
+
+        for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++)
+        {
+            // scale one band horizontally into temp_area rows 1..source_band_height
+            for (i = 0; i < source_band_height; i++)
+            {
+                // Trap case where we could read off the base of the source buffer
+
+                line_src = (unsigned char *)source + i * source_pitch;
+
+                if (line_src < source_base)
+                    line_src = source_base;
+
+                horiz_line_scale(line_src, source_width,
+                                 temp_area + (i + 1)*dest_pitch, dest_width);
+            }
+
+            // Vertical scaling: reads the band from temp_area (row 1 onwards) and writes to dest
+            vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
+            // Carry the band's last row into row 0: it is the "row above" for the next band.
+            if (interpolation)
+                vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+
+            // Next band...
+            source += (unsigned long) source_band_height  * source_pitch;
+            dest   += (unsigned long) dest_band_height * dest_pitch;
+        }
+
+        return;
+    }
+    // No fixed-ratio kernel pair applies: fall back to the generic 1-D scalers.
+    if (hscale == 2 && hratio == 1)
+        Scale1Dh = scale1d_2t1_ps;
+
+    if (vscale == 2 && vratio == 1)
+    {
+        if (interlaced)
+            Scale1Dv = scale1d_2t1_ps;
+        else
+            Scale1Dv = scale1d_2t1_i;
+    }
+
+    if (source_height == dest_height)
+    {
+        // heights match: only horizontal scaling is needed, one line at a time
+        for (k = 0; k < (int)dest_height; k++)
+        {
+            Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
+            source += source_pitch;
+            dest   += dest_pitch;
+        }
+
+        return;
+    }
+    // Band heights derive from temp_area_height - 1 (row 0 of temp_area carries over between bands).
+    if (dest_height > source_height)
+    {
+        dest_band_height   = temp_area_height - 1;
+        source_band_height = dest_band_height * source_height / dest_height;
+    }
+    else
+    {
+        source_band_height = temp_area_height - 1;
+        dest_band_height   = source_band_height * vratio / vscale;
+    }
+
+    // first row needs to be done so that we can stay one row ahead for vertical zoom
+    Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
+
+    // for each band of the image (band count rounded up)
+    bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+    for (k = 0; k < bands; k++)
+    {
+        // scale one band horizontally into temp_area rows 1..source_band_height
+        for (i = 1; i < source_band_height + 1; i++)
+        {
+            if (k * source_band_height + i < (int) source_height)
+            {
+                Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+                         temp_area + i * dest_pitch, 1, hratio, dest_width);
+            }
+            else  //  Duplicate the last row
+            {
+                // past the bottom of the source: repeat the previous temp_area row
+                duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+            }
+        }
+
+        // scale one band vertically, one column at a time
+        for (j = 0; j < (int)dest_width; j++)
+        {
+            Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+                     &dest[j], dest_pitch, vratio, dest_band_height);
+        }
+
+        // copy temp_area row 0 over from last row in the past
+        duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+        // move to the next band
+        source += source_band_height * source_pitch;
+        dest   += dest_band_height * dest_pitch;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src       : Pointer to frame to be scaled.
+ *                  YV12_BUFFER_CONFIG *dst       : Pointer to buffer to hold scaled frame.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal scale factor numerator.
+ *                  unsigned int hratio          : Horizontal scale factor denominator.
+ *                  unsigned int vscale          : Vertical scale factor numerator.
+ *                  unsigned int vratio          : Vertical scale factor denominator.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+void vp8_scale_frame
+(
+    YV12_BUFFER_CONFIG *src,
+    YV12_BUFFER_CONFIG *dst,
+    unsigned char *temp_area,
+    unsigned char temp_height,
+    unsigned int hscale,
+    unsigned int hratio,
+    unsigned int vscale,
+    unsigned int vratio,
+    unsigned int interlaced
+)
+{
+    int i;
+    int dw = (hscale - 1 + src->y_width * hratio) / hscale;     // scaled luma width, rounded up
+    int dh = (vscale - 1 + src->y_height * vratio) / vscale;    // scaled luma height, rounded up
+
+    // Scale the Y plane to dw x dh.
+    Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
+            (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+    // Pad each scaled row from column dw-1 to the right edge with the pixel at column dw-2.
+    if (dw < (int)dst->y_width)
+        for (i = 0; i < dh; i++)
+            duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1);
+    // Fill rows dh-1 and below with row dh-2.  NOTE(review): copies y_width + 1 bytes, unlike the chroma copies -- confirm.
+    if (dh < (int)dst->y_height)
+        for (i = dh - 1; i < (int)dst->y_height; i++)
+            duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+    // U plane: scaled to half the luma output size in each direction.
+    Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
+            (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+    // Right-edge padding.  NOTE(review): iterates over all uv_height rows, whereas the luma loop stops at dh.
+    if (dw / 2 < (int)dst->uv_width)
+        for (i = 0; i < dst->uv_height; i++)
+            duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+    // Bottom padding.  NOTE(review): the loop bound is y_height / 2 rather than uv_height.
+    if (dh / 2 < (int)dst->uv_height)
+        for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+            duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+    // V plane: same scaling and padding as the U plane.
+    Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
+            (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+    if (dw / 2 < (int)dst->uv_width)
+        for (i = 0; i < dst->uv_height; i++)
+            duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+
+    if (dh / 2 < (int) dst->uv_height)
+        for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+            duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : any_ratio_2d_scale
+ *
+ *  INPUTS        : SCALE_VARS *si          : Scale settings (HScale, HRatio, VScale and VRatio are read).
+ *                  const unsigned char *source : Pointer to source image.
+ *                  unsigned int source_pitch    : Stride of source image.
+ *                  unsigned int source_width    : Width of source image.
+ *                  unsigned int source_height   : Height of source image (NOT USED).
+ *                  unsigned char *dest         : Pointer to destination image.
+ *                  unsigned int dest_pitch      : Stride of destination image.
+ *                  unsigned int dest_width      : Width of destination image.
+ *                  unsigned int dest_height     : Height of destination image.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ *  FUNCTION      : Scale the image with changing aspect ratio.
+ *
+ *  SPECIAL NOTES : This scaling is a bi-linear scaling. Need to re-work the
+ *                  whole function for new scaling algorithm.
+ *
+ ****************************************************************************/
+static
+int any_ratio_2d_scale
+(
+    SCALE_VARS *si,
+    const unsigned char *source,
+    int source_pitch,
+    unsigned int source_width,
+    unsigned int source_height,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width,
+    unsigned int dest_height
+)
+{
+    unsigned int i, k;
+    unsigned int src_band_height  = 0;
+    unsigned int dest_band_height = 0;
+
+    // suggested scale factors
+    int hs = si->HScale;
+    int hr = si->HRatio;
+    int vs = si->VScale;
+    int vr = si->VRatio;
+
+    // assume the ratios are scalable instead of should be centered
+    int ratio_scalable = 1;
+
+    const unsigned char *source_base = ((source_pitch >= 0) ? source : (source + ((source_height - 1) * source_pitch)));
+    const unsigned char *line_src;
+
+    void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+    void (*vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
+    void (*last_vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
+
+    (void) si;
+
+    // find out the ratio for each direction
+    switch (hr * 30 / hs)
+    {
+    case 24:
+        // 4-5 Scale in Width direction
+        horiz_line_scale = vp8_horizontal_line_4_5_scale;
+        break;
+    case 22:
+        // 3-4 Scale in Width direction
+        horiz_line_scale = vp8_horizontal_line_3_4_scale;
+        break;
+
+    case 20:
+        // 4-5 Scale in Width direction
+        horiz_line_scale = vp8_horizontal_line_2_3_scale;
+        break;
+    case 18:
+        // 3-5 Scale in Width direction
+        horiz_line_scale = vp8_horizontal_line_3_5_scale;
+        break;
+    case 15:
+        // 1-2 Scale in Width direction
+        horiz_line_scale = vp8_horizontal_line_1_2_scale;
+        break;
+    case 30:
+        // no scale in Width direction
+        horiz_line_scale = horizontal_line_copy;
+        break;
+    default:
+        // The ratio is not acceptable now
+        // throw("The ratio is not acceptable for now!");
+        ratio_scalable = 0;
+        break;
+    }
+
+    switch (vr * 30 / vs)
+    {
+    case 24:
+        // 4-5 Scale in vertical direction
+        vert_band_scale     = vp8_vertical_band_4_5_scale;
+        last_vert_band_scale = vp8_last_vertical_band_4_5_scale;
+        src_band_height     = 4;
+        dest_band_height    = 5;
+        break;
+    case 22:
+        // 3-4 Scale in vertical direction
+        vert_band_scale     = vp8_vertical_band_3_4_scale;
+        last_vert_band_scale = vp8_last_vertical_band_3_4_scale;
+        src_band_height     = 3;
+        dest_band_height    = 4;
+        break;
+    case 20:
+        // 2-3 Scale in vertical direction
+        vert_band_scale     = vp8_vertical_band_2_3_scale;
+        last_vert_band_scale = vp8_last_vertical_band_2_3_scale;
+        src_band_height     = 2;
+        dest_band_height    = 3;
+        break;
+    case 18:
+        // 3-5 Scale in vertical direction
+        vert_band_scale     = vp8_vertical_band_3_5_scale;
+        last_vert_band_scale = vp8_last_vertical_band_3_5_scale;
+        src_band_height     = 3;
+        dest_band_height    = 5;
+        break;
+    case 15:
+        // 1-2 Scale in vertical direction
+        vert_band_scale     = vp8_vertical_band_1_2_scale;
+        last_vert_band_scale = vp8_last_vertical_band_1_2_scale;
+        src_band_height     = 1;
+        dest_band_height    = 2;
+        break;
+    case 30:
+        // no scale in vertical direction
+        vert_band_scale     = null_scale;
+        last_vert_band_scale = null_scale;
+        src_band_height     = 4;
+        dest_band_height    = 4;
+        break;
+    default:
+        // The ratio is not acceptable now
+        // throw("The ratio is not acceptable for now!");
+        ratio_scalable = 0;
+        break;
+    }
+
+    if (ratio_scalable == 0)
+        return ratio_scalable;
+
+    horiz_line_scale(source, source_width, dest, dest_width);
+
+    // except last band
+    for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++)
+    {
+        // scale one band horizontally
+        for (i = 1; i < src_band_height; i++)
+        {
+            // Trap case where we could read off the base of the source buffer
+            line_src = source + i * source_pitch;
+
+            if (line_src < source_base)
+                line_src = source_base;
+
+            horiz_line_scale(line_src, source_width,
+                             dest + i * dest_pitch, dest_width);
+        }
+
+        // first line of next band
+        // Trap case where we could read off the base of the source buffer
+        line_src = source + src_band_height * source_pitch;
+
+        if (line_src < source_base)
+            line_src = source_base;
+
+        horiz_line_scale(line_src, source_width,
+                         dest + dest_band_height * dest_pitch,
+                         dest_width);
+
+        // Vertical scaling is in place
+        vert_band_scale(dest, dest_pitch, dest_width);
+
+        // Next band...
+        source += src_band_height  * source_pitch;
+        dest   += dest_band_height * dest_pitch;
+    }
+
+    // scale one band horizontally
+    for (i = 1; i < src_band_height; i++)
+    {
+        // Trap case where we could read off the base of the source buffer
+        line_src = source + i * source_pitch;
+
+        if (line_src < source_base)
+            line_src = source_base;
+
+        horiz_line_scale(line_src, source_width,
+                         dest + i * dest_pitch,
+                         dest_width);
+    }
+
+    // Vertical scaling is in place
+    last_vert_band_scale(dest, dest_pitch, dest_width);
+
+    return ratio_scalable;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : any_ratio_frame_scale
+ *
+ *  INPUTS        : SCALE_VARS *scale_vars : Scale factors, output frame size and
+ *                                           the source/destination YV12 buffers.
+ *                  int YOffset            : Offset added to the destination Y buffer.
+ *                  int UVOffset           : Offset added to the destination U/V buffers.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ *  FUNCTION      : Scale the image with changing aspect ratio.
+ *
+ *  SPECIAL NOTES : HScale, HRatio, VScale and VRatio are used as divisors.
+ *
+ ****************************************************************************/
+static
+int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset)
+{
+    int i;
+    int ew;     // width, in output pixels, up to which output rows are cleared
+    int eh;     // number of output rows covered by the clearing below
+
+    // suggested scale factors
+    int hs = scale_vars->HScale;
+    int hr = scale_vars->HRatio;
+    int vs = scale_vars->VScale;
+    int vr = scale_vars->VRatio;
+
+    int ratio_scalable = 1;
+    // source size needed for the requested output size: ceil(output * ratio / scale)
+    int sw = (scale_vars->expanded_frame_width * hr + hs - 1) / hs;
+    int sh = (scale_vars->expanded_frame_height * vr + vs - 1) / vs;
+    int dw = scale_vars->expanded_frame_width;
+    int dh = scale_vars->expanded_frame_height;
+    YV12_BUFFER_CONFIG *src_yuv_config = scale_vars->src_yuv_config;
+    YV12_BUFFER_CONFIG *dst_yuv_config = scale_vars->dst_yuv_config;
+    // round the source size up to a multiple of 3 (ratio 3) or 8, then map it back to output units
+    if (hr == 3)
+        ew = (sw + 2) / 3 * 3 * hs / hr;
+    else
+        ew = (sw + 7) / 8 * 8 * hs / hr;
+
+    if (vr == 3)
+        eh = (sh + 2) / 3 * 3 * vs / vr;
+    else
+        eh = (sh + 7) / 8 * 8 * vs / vr;
+    // Y plane
+    ratio_scalable = any_ratio_2d_scale(scale_vars,
+                                        (const unsigned char *)src_yuv_config->y_buffer,
+                                        src_yuv_config->y_stride, sw, sh,
+                                        (unsigned char *) dst_yuv_config->y_buffer + YOffset,
+                                        dst_yuv_config->y_stride, dw, dh);
+    // clear the columns to the right of the scaled Y image (done even when scaling was refused)
+    for (i = 0; i < eh; i++)
+        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride + dw, 0, ew - dw);
+    // clear the rows below the scaled Y image
+    for (i = dh; i < eh; i++)
+        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride, 0, ew);
+
+    if (ratio_scalable == 0)
+        return ratio_scalable;
+    // chroma planes are half size in each direction, rounded up
+    sw = (sw + 1) >> 1;
+    sh = (sh + 1) >> 1;
+    dw = (dw + 1) >> 1;
+    dh = (dh + 1) >> 1;
+    // U plane: step through the source with its chroma stride (y_stride / 2 is wrong for odd widths)
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src_yuv_config->u_buffer,
+                       src_yuv_config->uv_stride, sw, sh,
+                       (unsigned char *)dst_yuv_config->u_buffer + UVOffset,
+                       dst_yuv_config->uv_stride, dw, dh);
+    // V plane: same geometry as U
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src_yuv_config->v_buffer,
+                       src_yuv_config->uv_stride, sw, sh,
+                       (unsigned char *)dst_yuv_config->v_buffer + UVOffset,
+                       dst_yuv_config->uv_stride, dw, dh);
+
+    return ratio_scalable;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : center_image
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_yuv_config : Image to copy.
+ *                  YV12_BUFFER_CONFIG *dst_yuv_config : Buffer receiving the copy.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the source planes, unscaled, to the middle of the
+ *                  destination planes.
+ *
+ ****************************************************************************/
+static void
+center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config)
+{
+    int line;
+    int top, left;
+    unsigned char *from;
+    unsigned char *to;
+
+    // margins that centre the source inside the destination luma plane
+    top  = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2;
+    left = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2;
+
+    // luma plane
+    from = src_yuv_config->y_buffer;
+    to   = (unsigned char *)dst_yuv_config->y_buffer + top * dst_yuv_config->y_stride + left;
+
+    for (line = 0; line < src_yuv_config->y_height; line++)
+    {
+        duck_memcpy(to, from, src_yuv_config->y_width);
+        from += src_yuv_config->y_stride;
+        to   += dst_yuv_config->y_stride;
+    }
+
+    top  = top / 2;
+    left = left / 2;
+
+    // first chroma plane (U), using the halved margins
+    from = src_yuv_config->u_buffer;
+    to   = (unsigned char *)dst_yuv_config->u_buffer + top * dst_yuv_config->uv_stride + left;
+
+    for (line = 0; line < src_yuv_config->uv_height; line++)
+    {
+        duck_memcpy(to, from, src_yuv_config->uv_width);
+        from += src_yuv_config->uv_stride;
+        to   += dst_yuv_config->uv_stride;
+    }
+
+    // second chroma plane (V), same margins as U
+    from = src_yuv_config->v_buffer;
+    to   = (unsigned char *)dst_yuv_config->v_buffer + top * dst_yuv_config->uv_stride + left;
+
+    for (line = 0; line < src_yuv_config->uv_height; line++)
+    {
+        duck_memcpy(to, from, src_yuv_config->uv_width);
+        from += src_yuv_config->uv_stride;
+        to   += dst_yuv_config->uv_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_scale_or_center
+ *
+ *  INPUTS        : src_yuv_config / dst_yuv_config : Source and destination buffers.
+ *                  expanded_frame_width / _height  : Size of the scaled output image.
+ *                  scaling_mode, HScale, HRatio, VScale, VRatio : Mode and scale factors.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Decides to scale or center image in scale buffer for blit
+ *
+ *  SPECIAL NOTES : Any other scaling_mode leaves the destination untouched.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_scale_or_center
+(
+    YV12_BUFFER_CONFIG *src_yuv_config,
+    YV12_BUFFER_CONFIG *dst_yuv_config,
+    int expanded_frame_width,
+    int expanded_frame_height,
+    int scaling_mode,
+    int HScale,
+    int HRatio,
+    int VScale,
+    int VRatio
+)
+{
+//    if ( ppi->post_processing_level )
+    //      update_umvborder ( ppi, frame_buffer );
+
+    // SCALE_TO_FIT and MAINTAIN_ASPECT_RATIO share one code path
+    switch (scaling_mode)
+    {
+    case SCALE_TO_FIT:
+    case MAINTAIN_ASPECT_RATIO:
+    {
+        SCALE_VARS scale_vars;
+        // offsets that centre the expanded frame inside the destination planes
+#if 1
+        int row = (dst_yuv_config->y_height - expanded_frame_height) / 2;
+        int col = (dst_yuv_config->y_width  - expanded_frame_width) / 2;
+//        int YOffset  = row * dst_yuv_config->y_width + col;
+//        int UVOffset = (row>>1) * dst_yuv_config->uv_width + (col>>1);
+        int YOffset  = row * dst_yuv_config->y_stride + col;                // rows advance by the stride, not the width
+        int UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1);
+#else   // disabled alternative: offsets taken from the source dimensions and widths
+        int row = (src_yuv_config->y_height - expanded_frame_height) / 2;
+        int col = (src_yuv_config->y_width  - expanded_frame_width) / 2;
+        int YOffset  = row * src_yuv_config->y_width + col;
+        int UVOffset = (row >> 1) * src_yuv_config->uv_width + (col >> 1);
+#endif
+
+        scale_vars.dst_yuv_config = dst_yuv_config;
+        scale_vars.src_yuv_config = src_yuv_config;
+        scale_vars.HScale = HScale;
+        scale_vars.HRatio = HRatio;
+        scale_vars.VScale = VScale;
+        scale_vars.VRatio = VRatio;
+        scale_vars.expanded_frame_width = expanded_frame_width;
+        scale_vars.expanded_frame_height = expanded_frame_height;
+
+        // perform center and scale; the 1/0 "scaled" result is not checked here
+        any_ratio_frame_scale(&scale_vars, YOffset, UVOffset);
+
+        break;
+    }
+    case CENTER:        // copy unscaled into the middle of the destination
+        center_image(src_yuv_config, dst_yuv_config);
+        break;
+
+    default:            // unknown mode: nothing is written
+        break;
+    }
+}
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
new file mode 100644
index 000000000..04617be51
--- /dev/null
+++ b/vpx_scale/generic/yv12config.c
@@ -0,0 +1,110 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+/****************************************************************************
+ * Releases the memory owned by a frame buffer; -1 if ybf is NULL, else 0.
+ ****************************************************************************/
+int
+vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    // Without a descriptor there is nothing to release; report it
+    // to the caller instead of dereferencing NULL.
+    if (!ybf)
+        return -1;
+
+    // Free the allocation only when one is attached...
+    if (ybf->buffer_alloc)
+    {
+        duck_free(ybf->buffer_alloc);
+    }
+
+    // ...and always leave the descriptor marked as empty.
+    ybf->buffer_alloc = 0;
+
+    return 0;
+}
+
+/****************************************************************************
+ * Allocates a bordered YV12 frame; returns 0, -1 (allocation failed) or -2 (NULL ybf).
+ ****************************************************************************/
+int
+vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border)
+{
+// Plane sizes in bytes, borders included.
+
+    int yplane_size = (height + 2 * border) * (width + 2 * border);                  // full border on all four sides
+    int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border);    // chroma size rounded up, plus one border in total per axis
+
+    if (ybf)
+    {
+        vp8_yv12_de_alloc_frame_buffer(ybf);    // release any buffer already attached to ybf
+
+        ybf->y_width  = width;
+        ybf->y_height = height;
+        ybf->y_stride = width + 2 * border;
+
+        ybf->uv_width = (1 + width) / 2;
+        ybf->uv_height = (1 + height) / 2;
+        ybf->uv_stride = ybf->uv_width + border;
+
+        ybf->border = border;
+        ybf->frame_size = yplane_size + 2 * uvplane_size;   // computed before yplane_size is rounded up below
+
+        // Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
+        // when we have a large motion vector in V on the last v block.
+        // Note : We never use these pixels anyway so this doesn't hurt.
+        ybf->buffer_alloc = (unsigned char *) duck_memalign(32,  ybf->frame_size + (ybf->y_stride * 2) + 32, 0);
+
+        if (ybf->buffer_alloc == NULL)
+            return -1;      // the dimensions set above remain; the plane pointers are not updated
+
+        ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border;     // skip the top border rows and the left border
+
+        if (yplane_size & 0xf)      // round the luma plane up to a multiple of 16 bytes (adds at most 15)
+            yplane_size += 16 - (yplane_size & 0xf);
+
+        ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * ybf->uv_stride) + border / 2;
+        ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * ybf->uv_stride) + border / 2;
+    }
+    else
+    {
+        return -2;
+    }
+
+    return 0;
+}
+
+/****************************************************************************
+ * Fills an allocated frame with Y = 0 and U = V = 0x80; -1 if ybf is NULL, else 0.
+ ****************************************************************************/
+int
+vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
+{
+    if (!ybf)
+        return -1;
+
+    // A descriptor without an allocation is accepted and simply left alone.
+    if (!ybf->buffer_alloc)
+        return 0;
+
+    // Each plane is filled for stride * height bytes starting at its plane pointer.
+    duck_memset(ybf->y_buffer, 0x0, ybf->y_stride * ybf->y_height);
+    duck_memset(ybf->u_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
+    duck_memset(ybf->v_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
+
+    return 0;
+}
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
new file mode 100644
index 000000000..4906625c8
--- /dev/null
+++ b/vpx_scale/generic/yv12extend.c
@@ -0,0 +1,279 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+/****************************************************************************
+ * Replicates the edge pixels of the Y, U and V planes into the surrounding border.
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+
+    unsigned int Border;    // border width used for the plane being processed
+    int plane_stride;
+    int plane_height;
+    int plane_width;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    plane_height = ybf->y_height;
+    plane_width = ybf->y_width;
+
+    // copy the left and right most columns out
+    src_ptr1 = ybf->y_buffer;                   // first pixel of the row
+    src_ptr2 = src_ptr1 + plane_width - 1;      // last pixel of the row
+    dest_ptr1 = src_ptr1 - Border;              // start of the row's left border
+    dest_ptr2 = src_ptr2 + 1;                   // start of the row's right border
+
+    for (i = 0; i < plane_height; i++)
+    {
+        vpx_memset(dest_ptr1, src_ptr1[0], Border);     // left border <- first pixel
+        vpx_memset(dest_ptr2, src_ptr2[0], Border);     // right border <- last pixel
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->y_buffer - Border;                                      // first row, from its left border
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;     // last row, from its left border
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);                         // top-most border row
+    dest_ptr2 = src_ptr2 + plane_stride;                                    // first row below the image
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);  // plane_stride bytes, starting at the left border
+        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+
+    /***********/
+    /* U Plane */
+    /***********/
+    plane_stride = ybf->uv_stride;
+    plane_height = ybf->uv_height;
+    plane_width = ybf->uv_width;
+    Border /= 2;    // the chroma planes use half the luma border
+
+    // copy the left and right most columns out
+    src_ptr1 = ybf->u_buffer;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        vpx_memset(dest_ptr1, src_ptr1[0], Border);
+        vpx_memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->u_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    /***********/
+    /* V Plane */
+    /***********/
+    // plane_stride, plane_height, plane_width and Border keep their U-plane values
+    // copy the left and right most columns out
+    src_ptr1 = ybf->v_buffer;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        vpx_memset(dest_ptr1, src_ptr1[0], Border);
+        vpx_memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->v_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+
+
+/****************************************************************************
+ * Extends the border of the Y plane only: replicates the first and last
+ * pixel of every luma row into the left and right border, then copies the
+ * first and last (already widened) rows into the top and bottom borders.
+ * The chroma planes are not touched.
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+
+    unsigned int Border;
+    int plane_stride;
+    int plane_height;
+    int plane_width;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    plane_height = ybf->y_height;
+    plane_width = ybf->y_width;
+
+    // copy the left and right most columns out
+    src_ptr1 = ybf->y_buffer;                   // first pixel of the row
+    src_ptr2 = src_ptr1 + plane_width - 1;      // last pixel of the row
+    dest_ptr1 = src_ptr1 - Border;              // start of the row's left border
+    dest_ptr2 = src_ptr2 + 1;                   // start of the row's right border
+
+    for (i = 0; i < plane_height; i++)
+    {
+        vpx_memset(dest_ptr1, src_ptr1[0], Border);
+        vpx_memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = ybf->y_buffer - Border;                                      // first row, from its left border
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;     // last row, from its left border
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);                         // top-most border row
+    dest_ptr2 = src_ptr2 + plane_stride;                                    // first row below the image
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+ *                  YV12_BUFFER_CONFIG *dst_ybc : Frame to copy into.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the Y, U and V planes of the source frame into the
+ *                  destination frame, then updates the destination's borders.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size; row counts
+ *                  and widths are always taken from the source frame.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned char *from, *to;
+    int line;
+
+    // Y plane: one row per pass, each pointer advancing by its own stride
+    from = src_ybc->y_buffer;
+    to = dst_ybc->y_buffer;
+    for (line = 0; line < src_ybc->y_height; line++)
+    {
+        vpx_memcpy(to, from, src_ybc->y_width);
+        to   += dst_ybc->y_stride;
+        from += src_ybc->y_stride;
+    }
+
+    // U plane
+    from = src_ybc->u_buffer;
+    to = dst_ybc->u_buffer;
+    for (line = 0; line < src_ybc->uv_height; line++)
+    {
+        vpx_memcpy(to, from, src_ybc->uv_width);
+        to   += dst_ybc->uv_stride;
+        from += src_ybc->uv_stride;
+    }
+
+    // V plane
+    from = src_ybc->v_buffer;
+    to = dst_ybc->v_buffer;
+    for (line = 0; line < src_ybc->uv_height; line++)
+    {
+        vpx_memcpy(to, from, src_ybc->uv_width);
+        to   += dst_ybc->uv_stride;
+        from += src_ybc->uv_stride;
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(dst_ybc);     // rebuild the destination borders
+}
+
+// Copies only the Y plane of src_ybc into dst_ybc, then extends the destination's Y border.
+void
+vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned char *from = src_ybc->y_buffer;
+    unsigned char *to = dst_ybc->y_buffer;
+    int line;
+
+    // row count and width come from the source; each pointer advances by its own stride
+    for (line = 0; line < src_ybc->y_height; line++)
+    {
+        vpx_memcpy(to, from, src_ybc->y_width);
+        to   += dst_ybc->y_stride;
+        from += src_ybc->y_stride;
+    }
+
+    // the chroma planes are not copied; only the luma border is rebuilt
+    vp8_yv12_extend_frame_borders_yonly(dst_ybc);
+}
diff --git a/vpx_scale/include/arm/vpxscale_nofp.h b/vpx_scale/include/arm/vpxscale_nofp.h
new file mode 100644
index 000000000..d6181d207
--- /dev/null
+++ b/vpx_scale/include/arm/vpxscale_nofp.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+// C versions of the 4_5, 2_3, 3_5, 3_4 and 1_2 scalers; the vertical ones work in place on dest.
+void  vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+// C versions of the 5_4, 5_3 and 2_1 scalers; the vertical ones take separate source and dest buffers.
+void  vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+// ARMv4 assembly routines (the symbols exported by gen_scalers_armv4.asm).
+void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+// Bind the generic vp8_* names: ARMv4 routines where one is declared above, C versions for everything else.
+#define vp8_vertical_band_4_5_scale     vertical_band_4_5_scale_armv4
+#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
+#define vp8_vertical_band_2_3_scale     vertical_band_2_3_scale_armv4
+#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
+#define vp8_vertical_band_3_5_scale     vertical_band_3_5_scale_armv4
+#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
+#define vp8_vertical_band_3_4_scale     vertical_band_3_4_scale_armv4
+#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c
+#define vp8_horizontal_line_1_2_scale   horizontal_line_1_2_scale_armv4
+#define vp8_horizontal_line_3_5_scale   horizontal_line_3_5_scale_armv4
+#define vp8_horizontal_line_3_4_scale   horizontal_line_3_4_scale_armv4
+#define vp8_horizontal_line_4_5_scale   horizontal_line_4_5_scale_armv4
+#define vp8_horizontal_line_2_3_scale   horizontal_line_2_3_scale_armv4
+#define vp8_vertical_band_1_2_scale     vertical_band_1_2_scale_armv4
+#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
+#define vp8_vertical_band_5_4_scale     vp8cx_vertical_band_5_4_scale_c
+#define vp8_vertical_band_5_3_scale     vp8cx_vertical_band_5_3_scale_c
+#define vp8_vertical_band_2_1_scale     vp8cx_vertical_band_2_1_scale_c
+#define vp8_vertical_band_2_1_scale_i   vp8cx_vertical_band_2_1_scale_i_c
+#define vp8_horizontal_line_2_1_scale   vp8cx_horizontal_line_2_1_scale_c
+#define vp8_horizontal_line_5_3_scale   vp8cx_horizontal_line_5_3_scale_c
+#define vp8_horizontal_line_5_4_scale   vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/generic/vpxscale_arbitrary.h b/vpx_scale/include/generic/vpxscale_arbitrary.h
new file mode 100644
index 000000000..2b50f24cf
--- /dev/null
+++ b/vpx_scale/include/generic/vpxscale_arbitrary.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __VPX_SCALE_ARBITRARY_H__
+#define __VPX_SCALE_ARBITRARY_H__
+
+#include "vpx_scale/yv12config.h"
+// Bicubic scaler state. NOTE(review): field roles are inferred from names and comments only; the code using them is not in this header.
+typedef struct
+{
+    int in_width;
+    int in_height;
+
+    int out_width;
+    int out_height;
+    int max_usable_out_width;
+
+    // numerator for the width and height
+    int nw;
+    int nh;
+    int nh_uv;      // NOTE(review): the _uv members presumably apply to the chroma planes - confirm
+
+    // output to input correspondence array
+    short *l_w;
+    short *l_h;
+    short *l_h_uv;
+
+    // polyphase coefficients
+    short *c_w;
+    short *c_h;
+    short *c_h_uv;
+
+    // buffer for horizontal filtering.
+    unsigned char *hbuf;
+    unsigned char *hbuf_uv;
+} BICUBIC_SCALER_STRUCT;
+// Bicubic scaler entry points; parameterless ones use (void) so calls are checked against a real prototype.
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height);
+int bicubic_scale(int in_width, int in_height, int in_stride,
+                  int out_width, int out_height, int out_stride,
+                  unsigned char *input_image, unsigned char *output_image);
+void bicubic_scale_frame_reset(void);
+void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                         int new_width, int new_height);
+void bicubic_coefficient_init(void);
+void bicubic_coefficient_destroy(void);
+
+#endif /* __VPX_SCALE_ARBITRARY_H__ */
diff --git a/vpx_scale/include/generic/vpxscale_depricated.h b/vpx_scale/include/generic/vpxscale_depricated.h
new file mode 100644
index 000000000..015eed0fc
--- /dev/null
+++ b/vpx_scale/include/generic/vpxscale_depricated.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     vpxscale_depricated.h
+*
+*   Description  :     Scaler function-pointer interface (this banner previously read: postp.h, Post processor interface)
+*
+****************************************************************************/
+#ifndef VPXSCALE_H   /* vpx_scale/include/leapster/vpxscale.h uses this same guard macro, so only the first of the two included takes effect */
+#define VPXSCALE_H
+/* Scaler entry points exposed as function pointers; the pointer objects themselves are defined outside this header. */
+extern void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* NOTE(review): presumably installs implementations into the pointers above according to the CPU-feature flags -- confirm. */
+extern void  dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled);
+
+#endif /* VPXSCALE_H */
diff --git a/vpx_scale/include/generic/vpxscale_nofp.h b/vpx_scale/include/generic/vpxscale_nofp.h
new file mode 100644
index 000000000..c4d5f4c6f
--- /dev/null
+++ b/vpx_scale/include/generic/vpxscale_nofp.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+/* Prototypes of the C scalers (N_M = N to M pixels), followed by macros that bind the generic vp8_* names to them. */
+void  vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* Routines with N > M (5_4, 5_3, 2_1); unlike the bands above, the vertical ones take a separate source pointer and pitch. */
+void  vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+/* Compile-time binding: every vp8_* scaler name below resolves to its vp8cx_*_c implementation. */
+#define vp8_vertical_band_4_5_scale     vp8cx_vertical_band_4_5_scale_c
+#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
+#define vp8_vertical_band_2_3_scale     vp8cx_vertical_band_2_3_scale_c
+#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
+#define vp8_vertical_band_3_5_scale     vp8cx_vertical_band_3_5_scale_c
+#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
+#define vp8_horizontal_line_1_2_scale   vp8cx_horizontal_line_1_2_scale_c
+#define vp8_horizontal_line_3_5_scale   vp8cx_horizontal_line_3_5_scale_c
+#define vp8_horizontal_line_4_5_scale   vp8cx_horizontal_line_4_5_scale_c
+#define vp8_horizontal_line_2_3_scale   vp8cx_horizontal_line_2_3_scale_c
+#define vp8_vertical_band_1_2_scale     vp8cx_vertical_band_1_2_scale_c
+#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
+#define vp8_vertical_band_5_4_scale     vp8cx_vertical_band_5_4_scale_c
+#define vp8_vertical_band_5_3_scale     vp8cx_vertical_band_5_3_scale_c
+#define vp8_vertical_band_2_1_scale     vp8cx_vertical_band_2_1_scale_c
+#define vp8_vertical_band_2_1_scale_i   vp8cx_vertical_band_2_1_scale_i_c
+#define vp8_horizontal_line_2_1_scale   vp8cx_horizontal_line_2_1_scale_c
+#define vp8_horizontal_line_5_3_scale   vp8cx_horizontal_line_5_3_scale_c
+#define vp8_horizontal_line_5_4_scale   vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/leapster/vpxscale.h b/vpx_scale/include/leapster/vpxscale.h
new file mode 100644
index 000000000..f70029cae
--- /dev/null
+++ b/vpx_scale/include/leapster/vpxscale.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     vpxscale.h
+*
+*   Description  :     Scaler function-pointer table interface (this banner previously read: postp.h, Post processor interface)
+*
+****************************************************************************/
+#ifndef VPXSCALE_H
+#define VPXSCALE_H
+
+
+// fwg 2004-10-14 -- function-pointer types (suffix _lf), one per scaler entry point
+typedef void (*vpxvertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxlast_vertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxvertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxlast_vertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxhorizontal_line_1_2_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+typedef void (*vpxhorizontal_line_3_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+typedef void (*vpxhorizontal_line_4_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+typedef void (*vpxvertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+typedef void (*vpxlast_vertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+
+/* Table with one member per *_lf pointer type; the members carry a _t suffix although they are data members, not types. */
+typedef struct vpxglobal_scalling_ptrs_t
+{
+    vpxvertical_band_4_5_scale_lf        vpxvertical_band_4_5_scale_t;
+    vpxlast_vertical_band_4_5_scale_lf    vpxlast_vertical_band_4_5_scale_t;
+    vpxvertical_band_3_5_scale_lf        vpxvertical_band_3_5_scale_t;
+    vpxlast_vertical_band_3_5_scale_lf    vpxlast_vertical_band_3_5_scale_t;
+    vpxhorizontal_line_1_2_scale_lf      vpxhorizontal_line_1_2_scale_t;
+    vpxhorizontal_line_3_5_scale_lf      vpxhorizontal_line_3_5_scale_t;
+    vpxhorizontal_line_4_5_scale_lf      vpxhorizontal_line_4_5_scale_t;
+    vpxvertical_band_1_2_scale_lf        vpxvertical_band_1_2_scale_t;
+    vpxlast_vertical_band_1_2_scale_lf    vpxlast_vertical_band_1_2_scale_t;
+} vpxglobal_scalling_ptrs;
+/* Pointer to such a table; the pointer object is defined outside this header. */
+extern struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs;
+
+/*
+extern void  (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void  (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void  (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void  (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void  (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+extern void  (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+extern void  (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+extern void  (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+extern void  (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+*/
+
+#endif
diff --git a/vpx_scale/include/symbian/vpxscale_nofp.h b/vpx_scale/include/symbian/vpxscale_nofp.h
new file mode 100644
index 000000000..d6181d207
--- /dev/null
+++ b/vpx_scale/include/symbian/vpxscale_nofp.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+/* Prototypes of the C scalers (N_M = N to M pixels). */
+void  vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* Routines with N > M (5_4, 5_3, 2_1); unlike the bands above, the vertical ones take a separate source pointer and pitch. */
+void  vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void  vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void  vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+/* ARMv4 assembly scalers; these symbols are exported by vpx_scale/arm/armv4/gen_scalers_armv4.asm. */
+void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+/* Binding: N < M scalers use the ARMv4 routines; the last_vertical_band_* and the 5_4 / 5_3 / 2_1 names stay on the C code. */
+#define vp8_vertical_band_4_5_scale     vertical_band_4_5_scale_armv4
+#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
+#define vp8_vertical_band_2_3_scale     vertical_band_2_3_scale_armv4
+#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
+#define vp8_vertical_band_3_5_scale     vertical_band_3_5_scale_armv4
+#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
+#define vp8_vertical_band_3_4_scale     vertical_band_3_4_scale_armv4
+#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c
+#define vp8_horizontal_line_1_2_scale   horizontal_line_1_2_scale_armv4
+#define vp8_horizontal_line_3_5_scale   horizontal_line_3_5_scale_armv4
+#define vp8_horizontal_line_3_4_scale   horizontal_line_3_4_scale_armv4
+#define vp8_horizontal_line_4_5_scale   horizontal_line_4_5_scale_armv4
+#define vp8_horizontal_line_2_3_scale   horizontal_line_2_3_scale_armv4
+#define vp8_vertical_band_1_2_scale     vertical_band_1_2_scale_armv4
+#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
+#define vp8_vertical_band_5_4_scale     vp8cx_vertical_band_5_4_scale_c
+#define vp8_vertical_band_5_3_scale     vp8cx_vertical_band_5_3_scale_c
+#define vp8_vertical_band_2_1_scale     vp8cx_vertical_band_2_1_scale_c
+#define vp8_vertical_band_2_1_scale_i   vp8cx_vertical_band_2_1_scale_i_c
+#define vp8_horizontal_line_2_1_scale   vp8cx_horizontal_line_2_1_scale_c
+#define vp8_horizontal_line_5_3_scale   vp8cx_horizontal_line_5_3_scale_c
+#define vp8_horizontal_line_5_4_scale   vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/vpxscale_nofp.h b/vpx_scale/include/vpxscale_nofp.h
new file mode 100644
index 000000000..f6482f944
--- /dev/null
+++ b/vpx_scale/include/vpxscale_nofp.h
@@ -0,0 +1,15 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#if defined(__S60_V20__) || defined(__SYMBIAN32__) && !defined(__WINS__)
+#include "symbian\vpxscale_nofp.h"
+#else
+#include "generic\vpxscale_nofp.h"
+#endif
diff --git a/vpx_scale/intel_linux/scaleopt.c b/vpx_scale/intel_linux/scaleopt.c
new file mode 100644
index 000000000..6555600e9
--- /dev/null
+++ b/vpx_scale/intel_linux/scaleopt.c
@@ -0,0 +1,1852 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     scaleopt.c
+*
+*   Description  :     Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+/****************************************************************************
+*  Module Statics
+****************************************************************************/
+#if 0   /* disabled: the routines visible below declare local copies of the tables they use */
+__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
+__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
+__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
+#endif  /* 0 */
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_3_5_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to the source pixels.
+ *                  unsigned int source_width    : Number of source pixels; the loop ends at source + source_width - 3.
+ *                  unsigned char *dest         : Pointer to the destination pixels.
+ *                  unsigned int dest_width      : Not used.
+ *
+ *  OUTPUTS       : Five bytes are stored at dest for every three source pixels.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
+ *                  Taps are weighted n/5 in 8-bit fixed point (51, 102, 154, 205) and rounded with +128.
+ *  SPECIAL NOTES : The final group repeats its last pixel but still loads a DWORD, i.e. one byte past that group.
+ *                  No emms is issued here. NOTE(review): source_width is presumably a multiple of 3 -- confirm.
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    __declspec(align(16)) unsigned short const35_2[] = { 154,  51, 205, 102 };
+    __declspec(align(16)) unsigned short const35_1[] = { 102, 205,  51, 154 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    (void) dest_width;
+
+    __asm
+    {
+        push ebx                            // ebx is scratch below; restored before the block ends
+
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        lea         edx,    [esi+ecx-3];    // edx = address of the final 3-pixel group
+
+        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
+        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        cmp         esi,    edx             // a line of a single group leaves nothing for the loop:
+        jae         horiz_line_3_5_last     // test first, as the C reference below does (addresses, so unsigned)
+        horiz_line_3_5_loop:
+        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
+        mov         ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             // right-hand pixel of each pair * its weight
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             // left-hand pixel of each pair * its weight
+
+        mov         [edi],  ebx             // write output 00 (the 3 bytes after it are overwritten by the movd below)
+        add         esi,    3
+
+        add         edi,    5
+        paddw       mm0,    mm1             // left + right
+
+        paddw       mm0,    mm4             // add round values
+        psrlw       mm0,    8               // back to 8-bit range
+
+        cmp         esi,    edx
+        packuswb    mm0,    mm7
+
+        movd        DWORD Ptr [edi-4], mm0  // write output 01 02 03 04 (MMX ops leave the cmp flags intact)
+        jb          horiz_line_3_5_loop     // unsigned: jl would misorder addresses on either side of 0x80000000
+
+        horiz_line_3_5_last:                // final group: source byte 03 is masked out and 02 is repeated
+        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
+        mov         ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         eax,    8               // eax = xx 01 02 02
+        and         eax,    0xffff0000      // eax = xx xx 02 02
+
+        or          eax,    ebx             // eax = 01 02 02 02
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             // right-hand pixel of each pair * its weight
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             // left-hand pixel of each pair * its weight
+
+        mov         [edi],  ebx             // write output 00 (the 3 bytes after it are overwritten by the movd below)
+        paddw       mm0,    mm1             // left + right
+
+        paddw       mm0,    mm4             // add round values
+        psrlw       mm0,    8               // back to 8-bit range
+
+        packuswb    mm0,    mm7
+        movd        DWORD Ptr [edi+1], mm0  // write output 01 02 03 04
+
+        pop ebx
+
+    }
+
+    /* C reference for the assembly above (not compiled):
+    const unsigned char *src = source;
+    unsigned char *des = dest;
+    unsigned int a, b, c ;
+    unsigned int i;
+    (void) dest_width;
+
+    for ( i=0; i<source_width-3; i+=3 )
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (UINT8) (a);
+        // 2 * left + 3 * right /5
+        des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
+        c = src[2] ;
+        // 4 * left + 1 * right /5
+        des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
+        // 1 * left + 4 * right /5
+        des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
+
+        a = src[3];
+        // 3 * left + 2 * right /5
+        des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);
+
+        src += 3;
+        des += 5;
+    }
+
+    a = src[0];
+    b = src[1];
+    des [0] = (UINT8) (a);
+    // 2 * left + 3 * right /5
+    des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
+    c = src[2] ;
+    // 4 * left + 1 * right /5
+    des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
+    // 1 * left + 4 * right /5
+    des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
+
+    des [4] = (UINT8) (c);
+    */
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_4_5_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to the source pixels.
+ *                  unsigned int source_width    : Number of source pixels; the loop ends at source + source_width - 8.
+ *                  unsigned char *dest         : Pointer to the destination pixels.
+ *                  unsigned int dest_width      : Not used.
+ *
+ *  OUTPUTS       : Ten bytes are stored at dest for every eight source pixels.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels, two 4-pixel groups per pass.
+ *                  Taps are weighted n/5 in 8-bit fixed point (51, 102, 154, 205) and rounded with +128.
+ *  SPECIAL NOTES : The final eight pixels repeat pixel 07 in place of the absent pixel 08. No emms is issued
+ *                  here. NOTE(review): source_width is presumably a multiple of 8 -- confirm with the callers.
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+    __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102,  51 };
+    __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 };
+    __declspec(align(16)) unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
+
+    (void)dest_width;
+
+    __asm
+    {
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        lea         edx,    [esi+ecx-8];    // edx = address of the final 8-pixel group
+
+        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
+        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        cmp         esi,    edx             // a line of a single 8-pixel group leaves nothing for the loop:
+        jae         horiz_line_4_5_last     // test first instead of always running one pass (addresses, so unsigned)
+        horiz_line_4_5_loop:
+        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
+
+        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [edi+5], mm2            // write output 05 xx xx xx
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
+
+        paddw       mm0,    mm1             // left + right
+        paddw       mm0,    mm4             // added round values
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7
+
+        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
+        add         edi,    10
+
+        add         esi,    8
+        paddw       mm2,    mm3             // left + right
+
+        paddw       mm2,    mm4             // added round values
+        cmp         esi,    edx
+
+        psrlw       mm2,    8
+        packuswb    mm2,    mm7
+
+        movd        DWORD PTR [edi-4], mm2 // write output 06 07 08 09 (MMX ops leave the cmp flags intact)
+        jb          horiz_line_4_5_loop     // unsigned: jl would misorder addresses on either side of 0x80000000
+
+        horiz_line_4_5_last:                // final group: pixel 07 stands in for the absent pixel 08
+        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
+
+        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
+        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
+
+        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
+        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
+
+        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 07
+
+        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [edi+5], mm2  // write output 05 xx xx xx
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
+
+        paddw       mm0,    mm1             // left + right
+        paddw       mm0,    mm4             // added round values
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
+
+        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
+        paddw       mm2,    mm3             // left + right
+
+        paddw       mm2,    mm4             // added round values
+        psrlw       mm2,    8
+
+        packuswb    mm2,    mm7
+        movd        DWORD PTR [edi+6], mm2  // write output 06 07 08 09
+
+
+    }
+    /* C reference, one 4-pixel group per pass where the assembly above handles two (not compiled):
+        const unsigned char *src = source;
+        unsigned char *des = dest;
+        unsigned int a, b, c ;
+        unsigned i;
+        (void) dest_width;
+
+        for ( i=0; i<source_width-4; i+=4 )
+        {
+            a = src[0];
+            b = src[1];
+            des [0] = (UINT8) a;
+            des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
+            c = src[2] * 154;
+            a = src[3];
+            des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
+            des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
+            b = src[4];
+            des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);
+
+            src += 4;
+            des += 5;
+        }
+
+        a = src[0];
+        b = src[1];
+        des [0] = (UINT8) (a);
+        des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
+        c = src[2] * 154;
+        a = src[3];
+        des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
+        des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
+        des [4] = (UINT8) (a);
+    */
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has a "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+
+    /*
+     * In-place vertical 4 -> 5 scaling of one band, eight columns per loop pass.
+     * With a, b, c, d = rows 0..3 starting at dest and an = row 5 (the row that
+     * [edi+ecx*2] addresses, labelled "Src[0] of the next group" below):
+     *
+     *   row 1 = (a *  51 + b  * 205 + 128) >> 8
+     *   row 2 = (b * 102 + c  * 154 + 128) >> 8
+     *   row 3 = (c * 154 + d  * 102 + 128) >> 8
+     *   row 4 = (d * 205 + an *  51 + 128) >> 8
+     *
+     * Row 0 is not written. Every row is loaded and unpacked before it is
+     * overwritten, so each blend uses the original pixel values.
+     *
+     * The weights below are fifths expressed in 1/256 units (the sums are
+     * shifted right by 8); round_values supplies the +128 rounding term.
+     * Each table holds four 16-bit lanes to match the unpacked MMX words.
+     */
+    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
+    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
+    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    __asm
+    {
+
+        mov         esi,    dest                    // esi = row 0 of the band (rows are read and written in place)
+        mov         ecx,    dest_pitch               // ecx = pitch: byte distance between rows
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below: edi = row 3
+
+        pxor        mm7,    mm7                     // mm7 = 0, used as the zero half when unpacking bytes to words
+        mov         edx,    dest_width               // Loop counter: columns still to process
+
+        vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth                // mm5 = 1/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // mm6 = 4/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (unpacked low/high words, still the original values)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+        movq        mm1,    [edi]                   // mm1=Src[3];
+
+        // mm0, mm2 --- Src[2] (unpacked low/high words, still the original values)
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [edi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3] (unpacked low/high words)
+        //  mm7 -- cleared for unpacking
+
+        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group (row 5)
+
+        movq        mm5,    four_fifths              // mm5 = 4/5
+        pmullw      mm1,    mm5                     // d * 4/5
+
+        movq        mm6,    one_fifth                // mm6 = 1/5
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm3,    mm5                     // d * 4/5
+        punpcklbw   mm0,    mm7                     // unpack low
+
+        pmullw      mm0,    mm6                     // an * 1/5
+        punpckhbw   mm2,    mm7                     // unpack high
+
+        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
+        pmullw      mm2,    mm6                     // an * 1/5
+
+        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm1        // write des[4]
+
+        add         edi,    8                       // advance both row pointers to the next 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done; the body runs at least once, so the
+        jg         vs_4_5_loop                      // width is effectively rounded up to a multiple of 8
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does not
+ *                  read the band below the current band; the last output
+ *                  line is a copy of the last source line. The function
+ *                  also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    /*
+     * In-place vertical 4 -> 5 scaling of a band that has no following band,
+     * eight columns per loop pass. With a, b, c, d = rows 0..3 starting at dest:
+     *
+     *   row 1 = (a *  51 + b * 205 + 128) >> 8
+     *   row 2 = (b * 102 + c * 154 + 128) >> 8
+     *   row 3 = (c * 154 + d * 102 + 128) >> 8
+     *   row 4 = d   (copied before row 3 is overwritten)
+     *
+     * Row 0 is not written. The weights are fifths in 1/256 units and
+     * round_values is the +128 rounding term; each table holds four 16-bit
+     * lanes. one_fifth and four_fifths are only used for row 1 here.
+     */
+    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
+    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
+    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    __asm
+    {
+        mov         esi,    dest                    // esi = row 0 of the band (rows are read and written in place)
+        mov         ecx,    dest_pitch               // ecx = pitch: byte distance between rows
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below: edi = row 3
+
+        pxor        mm7,    mm7                     // mm7 = 0, used as the zero half when unpacking bytes to words
+        mov         edx,    dest_width               // Loop counter: columns still to process
+
+        last_vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth                // mm5 = 1/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // mm6 = 4/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (unpacked low/high words, still the original values)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+        movq        mm1,    [edi]                   // mm1=Src[3];
+
+        movq        QWORD ptr [edi+ecx], mm1        // write des[4]: unmodified copy of Src[3]
+
+        // mm0, mm2 --- Src[2] (unpacked low/high words, still the original values)
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [edi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3] (unpacked low/high words, not used again)
+        //  mm7 -- still zero for the next pass
+        add         edi,    8                       // advance both row pointers to the next 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done; the body runs at least once, so the
+        jg          last_vs_4_5_loop                // width is effectively rounded up to a multiple of 8
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has a "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    /*
+     * In-place vertical 3 -> 5 scaling of one band, eight columns per loop pass.
+     * With a, b, c = rows 0..2 starting at dest and an = row 5 (the row that
+     * [edi+ecx*2] addresses, labelled "Src[0] of the next group" below):
+     *
+     *   row 1 = (a * 102 + b  * 154 + 128) >> 8
+     *   row 2 = (b * 205 + c  *  51 + 128) >> 8
+     *   row 3 = (b *  51 + c  * 205 + 128) >> 8
+     *   row 4 = (c * 154 + an * 102 + 128) >> 8
+     *
+     * Row 0 is not written. Rows 1 and 2 are loaded and unpacked before they
+     * are overwritten, so each blend uses the original pixel values.
+     *
+     * The weights are fifths in 1/256 units and round_values is the +128
+     * rounding term; each table holds four 16-bit lanes.
+     */
+    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
+    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
+    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    __asm
+    {
+        mov         esi,    dest                    // esi = row 0 of the band (rows are read and written in place)
+        mov         ecx,    dest_pitch               // ecx = pitch: byte distance between rows
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below: edi = row 3
+
+        pxor        mm7,    mm7                     // mm7 = 0 for unpacking (re-cleared inside the loop after scratch use)
+        mov         edx,    dest_width               // Loop counter: columns still to process
+
+        vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (unpacked low/high words, still the original values)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high (last unpack before mm7 is reused as scratch)
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high (mm7 is no longer zero from here)
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [edi], mm4            // write des[3]
+
+        //  mm0, mm2 --- Src[2] (c, unpacked low/high words, still the original values)
+
+        pxor        mm7,    mm7                     // clear mm7 for unpacking (also restores it for the next pass)
+        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group (row 5)
+
+        movq        mm5,    three_fifths             // mm5 = 3/5
+        pmullw      mm0,    mm5                     // c * 3/5
+
+        movq        mm6,    two_fifths                // mm6 = 2/5
+        movq        mm3,    mm1                     // make a copy
+
+        pmullw      mm2,    mm5                     // c * 3/5
+        punpcklbw   mm1,    mm7                     // unpack low
+
+        pmullw      mm1,    mm6                     // an * 2/5
+        punpckhbw   mm3,    mm7                     // unpack high
+
+        paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
+        pmullw      mm3,    mm6                     // an * 2/5
+
+        paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
+
+        add         edi,    8                       // advance both row pointers to the next 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done; the body runs at least once, so the
+        jg          vs_3_5_loop                     // width is effectively rounded up to a multiple of 8
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of the last 3-pixel high band in an image.
+ *
+ *  SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, this routine does not
+ *                  read the band below the current band; the last output
+ *                  line is a copy of the last source line. The function
+ *                  also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    /*
+     * In-place vertical 3 -> 5 scaling of a band that has no following band,
+     * eight columns per loop pass. With a, b, c = rows 0..2 starting at dest:
+     *
+     *   row 1 = (a * 102 + b * 154 + 128) >> 8
+     *   row 2 = (b * 205 + c *  51 + 128) >> 8
+     *   row 3 = (b *  51 + c * 205 + 128) >> 8
+     *   row 4 = c   (copied before row 2 is overwritten)
+     *
+     * Row 0 is not written. The weights are fifths in 1/256 units and
+     * round_values is the +128 rounding term; each table holds four 16-bit
+     * lanes.
+     *
+     * mm7 doubles as the zero register for byte->word unpacking and as a
+     * multiply scratch register, so it must be zeroed again before every
+     * loop pass; otherwise every pass after the first unpacks against
+     * stale products and produces corrupt pixels for columns 8 and up.
+     */
+    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
+    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
+    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+    __asm
+    {
+        mov         esi,    dest                    // esi = row 0 of the band (rows are read and written in place)
+        mov         ecx,    dest_pitch               // ecx = pitch: byte distance between rows
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below: edi = row 3
+
+        pxor        mm7,    mm7                     // mm7 = 0 for unpacking in the first pass
+        mov         edx,    dest_width               // Loop counter: columns still to process
+
+
+        last_vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+
+
+        // mm1, mm3 --- Src[1] (unpacked low/high words, still the original values)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        QWORD ptr [edi+ecx], mm0        // write des[4]: unmodified copy of Src[2]
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high (last unpack before mm7 is reused as scratch)
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high (mm7 is no longer zero from here)
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [edi], mm4            // write des[3]
+
+        //  mm0, mm2 --- Src[2] (c, unpacked low/high words, not used again)
+
+        pxor        mm7,    mm7                     // re-clear mm7: it held c * 4/5 and the next pass unpacks with it
+
+        add         edi,    8                       // advance both row pointers to the next 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done; the body runs at least once, so the
+        jg          last_vs_3_5_loop                // width is effectively rounded up to a multiple of 8
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has a "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    /*
+     * Fills row 1 (dest + dest_pitch) with the rounded average of row 0
+     * (dest) and row 2 (dest + 2 * dest_pitch), eight columns per loop pass:
+     *
+     *   row 1 = (row 0 + row 2 + 1) >> 1
+     *
+     * Rows 0 and 2 are only read. four_ones is the +1 rounding term, one
+     * per 16-bit lane.
+     */
+    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};
+
+    __asm
+    {
+
+        mov         esi,    dest                    // esi = row 0
+        mov         ecx,    dest_pitch               // ecx = pitch: byte distance between rows
+
+        pxor        mm7,    mm7                     // mm7 = 0, used as the zero half when unpacking bytes to words
+        mov         edx,    dest_width               // Loop counter: columns still to process
+
+        vs_1_2_loop:
+
+        movq        mm0,    [esi]                   // get Src[0] (row 0)
+        movq        mm1,    [esi + ecx * 2]         // get Src[1] (row 2, the next source line)
+
+        movq        mm2,    mm0                     // make copy before unpack
+        movq        mm3,    mm1                     // make copy before unpack
+
+        punpcklbw   mm0,    mm7                     // low Src[0]
+        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
+
+        punpcklbw   mm1,    mm7                     // low Src[1]
+        paddw       mm0,    mm1                     // low (a + b)
+
+        punpckhbw   mm2,    mm7                     // high Src[0]
+        paddw       mm0,    mm6                     // low (a + b + 1)
+
+        punpckhbw   mm3,    mm7                     // high Src[1]
+        paddw       mm2,    mm3                     // high (a + b )
+
+        psraw       mm0,    1                       // low (a + b +1 )/2; sums are at most 511, so the sign bit is never set
+        paddw       mm2,    mm6                     // high (a + b + 1)
+
+        psraw       mm2,    1                       // high (a + b + 1)/2
+        packuswb    mm0,    mm2                     // pack results
+
+        movq        [esi+ecx], mm0                  // write out eight bytes to row 1
+        add         esi,    8                       // next 8 columns
+
+        sub         edx,    8                       // 8 columns done; the body runs at least once, so the
+        jg          vs_1_2_loop                     // width is effectively rounded up to a multiple of 8
+    }
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
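+ *  FUNCTION      : 1 to 2 up-scaling of the last band of pixels: the line
+ *                  below the source line is filled with a copy of it.
+ *
+ *  SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx, this routine does not
+ *                  read the band below the current band. The function also
+ *                  has a "C" only version.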
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    /*
+     * Duplicates the line at dest into the line directly below it
+     * (dest + dest_pitch), eight bytes per pass. The body always runs at
+     * least once, so the width is effectively rounded up to a multiple of 8.
+     */
+    __asm
+    {
+        mov         eax,    dest                    // eax = current position in the source line
+        mov         edx,    dest_pitch               // edx = byte distance to the line below
+        mov         ecx,    dest_width               // ecx = columns still to copy
+
+        dup_last_row_1_2:
+
+        movq        mm0,    [eax]                   // fetch eight source pixels
+        movq        [eax+edx], mm0                  // store them one line down
+
+        add         eax,    8                       // step to the next eight columns
+        sub         ecx,    8                       // flags from this sub drive the branch
+
+        jg          dup_last_row_1_2
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_1_2_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+     * Horizontal 1 -> 2 scaling of one line, eight source pixels (sixteen
+     * output pixels) per step:
+     *
+     *   dest[2 * i]     = source[i]
+     *   dest[2 * i + 1] = (source[i] + source[i + 1] + 1) >> 1
+     *
+     * The main loop reads source[i + 1] straight from memory, so it only
+     * handles blocks that are followed by at least one more block. The final
+     * eight pixels are handled by the tail, which uses the last pixel as its
+     * own right-hand neighbour instead of reading past the line.
+     *
+     * When source_width is 8 or less the main loop is skipped entirely;
+     * running it once before the tail would process sixteen source pixels
+     * and overrun both buffers. The tail always processes a full eight
+     * pixels. dest_width is unused.
+     *
+     * four_ones is the +1 rounding term, one per 16-bit lane.
+     */
+    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};
+
+    (void) dest_width;
+
+    __asm
+    {
+        mov         esi,    source                  // esi = read position
+        mov         edi,    dest                    // edi = write position
+
+        pxor        mm7,    mm7                     // mm7 = 0, used as the zero half when unpacking bytes to words
+        movq        mm6,    four_ones                // mm6 = 1, 1, 1, 1 (rounding)
+
+        mov         ecx,    source_width             // ecx = source pixels still to process
+
+        cmp         ecx,    8                       // a single block (or less) belongs to the tail only
+        jle         hs_1_2_last
+
+        hs_1_2_loop:
+
+        movq        mm0,    [esi]                   // source[i .. i+7]
+        movq        mm1,    [esi+1]                 // right-hand neighbours source[i+1 .. i+8]
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        movq        mm4,    mm0                     // keep the original pixels for interleaving
+        punpcklbw   mm0,    mm7
+
+        punpcklbw   mm1,    mm7
+        paddw       mm0,    mm1                     // low (a + b)
+
+        paddw       mm0,    mm6                     // low (a + b + 1)
+        punpckhbw   mm2,    mm7
+
+        punpckhbw   mm3,    mm7
+        paddw       mm2,    mm3                     // high (a + b)
+
+        paddw       mm2,    mm6                     // high (a + b + 1)
+        psraw       mm0,    1
+
+        psraw       mm2,    1
+        packuswb    mm0,    mm2                     // mm0 = eight interpolated pixels
+
+        movq        mm2,    mm4
+        punpcklbw   mm2,    mm0                     // interleave original / interpolated, pixels 0..3
+
+        movq        [edi],  mm2
+        punpckhbw   mm4,    mm0                     // interleave original / interpolated, pixels 4..7
+
+        movq        [edi+8], mm4
+        add         esi,    8
+
+        add         edi,    16
+        sub         ecx,    8
+
+        cmp         ecx,    8                       // leave the final block for the tail
+        jg          hs_1_2_loop
+
+        hs_1_2_last:
+
+// last eight pixels
+
+        movq        mm0,    [esi]
+        movq        mm1,    mm0
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        psrlq       mm1,    8                       // shift pixels down by one: byte i = pixel i+1, byte 7 = 0
+        psrlq       mm3,    56
+
+        psllq       mm3,    56                      // isolate the last pixel in byte 7
+        por         mm1,    mm3                     // neighbours, with the last pixel repeated
+
+        movq        mm3,    mm1
+        movq        mm4,    mm0                     // keep the original pixels for interleaving
+
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+
+        paddw       mm0,    mm1                     // low (a + b)
+        paddw       mm0,    mm6                     // low (a + b + 1)
+
+        punpckhbw   mm2,    mm7
+        punpckhbw   mm3,    mm7
+
+        paddw       mm2,    mm3                     // high (a + b)
+        paddw       mm2,    mm6                     // high (a + b + 1)
+
+        psraw       mm0,    1
+        psraw       mm2,    1
+
+        packuswb    mm0,    mm2                     // mm0 = eight interpolated pixels
+        movq        mm2,    mm4
+
+        punpcklbw   mm2,    mm0                     // interleave original / interpolated, pixels 0..3
+        movq        [edi],  mm2
+
+        punpckhbw   mm4,    mm0                     // interleave original / interpolated, pixels 4..7
+        movq        [edi+8], mm4
+    }
+}
+
+
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_5_4_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 4.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+
+    __declspec(align(16)) const unsigned short const54_2[] = {  0,  64, 128, 192 };
+    __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128,  64 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+    /*
+    unsigned i;
+    unsigned int a, b, c, d, e;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    for ( i=0; i<source_width; i+=5 )
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];
+        d = src[3];
+        e = src[4];
+
+        des[0] = a;
+        des[1] = ((b*192 + c* 64 + 128)>>8);
+        des[2] = ((c*128 + d*128 + 128)>>8);
+        des[3] = ((d* 64 + e*192 + 128)>>8);
+
+        src += 5;
+        des += 4;
+    }
+    */
+    __asm
+    {
+
+        mov         esi,        source              ;
+        mov         edi,        dest                ;
+
+        mov         ecx,        source_width         ;
+        movq        mm5,        const54_1           ;
+
+        pxor        mm7,        mm7                 ;
+        movq        mm6,        const54_2           ;
+
+        movq        mm4,        round_values         ;
+        lea         edx,        [esi+ecx]           ;
+        horizontal_line_5_4_loop:
+
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psrlq       mm0,        8                   ;
+        01 02 03 04 05 06 07 xx
+        punpcklbw   mm1,        mm7                 ;
+        xx 00 xx 01 xx 02 xx 03
+
+        punpcklbw   mm0,        mm7                 ;
+        xx 01 xx 02 xx 03 xx 04
+        pmullw      mm1,        mm5
+
+        pmullw      mm0,        mm6
+        add         esi,        5
+
+        add         edi,        4
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        cmp         esi,        edx
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-4], mm1
+
+        jl          horizontal_line_5_4_loop
+
+    }
+
+}
+
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /*
+     * Shrinks a band vertically: five source rows (source, source+src_pitch,
+     * ...) produce four destination rows, four columns per loop pass.
+     * With a..e the five source rows of one column:
+     *     row 0 = a
+     *     row 1 = (b*192 + c* 64 + 128) >> 8
+     *     row 2 = (c*128 + d*128 + 128) >> 8
+     *     row 3 = (d* 64 + e*192 + 128) >> 8
+     */
+
+    __declspec(align(16)) const unsigned short one_fourths[]   = {  64,  64,  64, 64  };
+    __declspec(align(16)) const unsigned short two_fourths[]   = { 128, 128, 128, 128 };
+    __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 };
+
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+    __asm
+    {
+        push        ebx
+
+        mov         esi,    source                    // esi = source row 0, current column
+        mov         ecx,    src_pitch               // ecx = source pitch
+
+        mov         edi,    dest                    // edi = dest row 0, current column
+        pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
+
+        mov         edx,    dest_pitch               // edx = destination pitch
+        mov         ebx,    dest_width              // ebx = columns still to process
+
+        vs_5_4_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // a: source row 0, copied unchanged
+        movd        mm1,    DWORD ptr [esi+ecx]     // b: source row 1
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]   // c: source row 2
+        lea         eax,    [esi+ecx*2]             // eax = source row 2
+
+        punpcklbw   mm1,    mm7
+        punpcklbw   mm2,    mm7
+
+        movq        mm3,    mm2                     // second copy of c for dest row 2
+        pmullw      mm1,    three_fourths           // b * 192
+
+        pmullw      mm2,    one_fourths             // c * 64
+        movd        mm4,    [eax+ecx]               // d: source row 3
+
+        pmullw      mm3,    two_fourths             // c * 128
+        punpcklbw   mm4,    mm7
+
+        movq        mm5,    mm4                     // second copy of d for dest row 3
+        pmullw      mm4,    two_fourths             // d * 128
+
+        paddw       mm1,    mm2
+        movd        mm6,    [eax+ecx*2]             // e: source row 4
+
+        pmullw      mm5,    one_fourths             // d * 64
+        paddw       mm1,    round_values;
+
+        paddw       mm3,    mm4
+        psrlw       mm1,    8                       // dest row 1
+
+        punpcklbw   mm6,    mm7
+        paddw       mm3,    round_values
+
+        pmullw      mm6,    three_fourths           // e * 192
+        psrlw       mm3,    8                       // dest row 2
+
+        packuswb    mm1,    mm7
+        packuswb    mm3,    mm7
+
+        movd        DWORD PTR [edi], mm0
+        movd        DWORD PTR [edi+edx], mm1
+
+
+        paddw       mm5,    mm6
+        movd        DWORD PTR [edi+edx*2], mm3
+
+        lea         eax,    [edi+edx*2]             // eax = dest row 2, taken before edi advances
+        paddw       mm5,    round_values
+
+        psrlw       mm5,    8                       // dest row 3
+        add         edi,    4
+
+        packuswb    mm5,    mm7
+        movd        DWORD PTR [eax+edx], mm5
+
+        add         esi,    4
+        sub         ebx,    4
+
+        jg         vs_5_4_loop                      // loop while columns remain
+
+        pop         ebx
+    }
+}
+
+
+
+static
+void horizontal_line_5_3_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+     * Shrinks a line horizontally: every group of five source pixels a..e
+     * produces three destination pixels
+     *     des[0] = a
+     *     des[1] = (b* 85 + c*171 + 128) >> 8
+     *     des[2] = (d*171 + e* 85 + 128) >> 8
+     * dest_width is not used.
+     *
+     * const53_2 scales the even bytes (a, c, e), const53_1 the odd bytes
+     * (b, d) after they have been moved up by one word lane.
+     */
+    __declspec(align(16)) const unsigned short const53_1[] = {  0,  85, 171, 0 };
+    __declspec(align(16)) const unsigned short const53_2[] = {256, 171,  85, 0 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         esi,        source              ; esi = read cursor
+        mov         edi,        dest                ; edi = write cursor
+
+        mov         ecx,        source_width        ;
+        movq        mm5,        const53_1           ; weights for the odd bytes
+
+        pxor        mm7,        mm7                 ; mm7 = 0
+        movq        mm6,        const53_2           ; weights for the even bytes
+
+        movq        mm4,        round_values        ; rounding term per word
+        lea         edx,        [esi+ecx-5]         ; edx = start of the final group
+
+        ; The loop tests its condition at the bottom, so a line holding a
+        ; single group must bypass it; otherwise the exit code would process
+        ; a second group that lies beyond the source line.
+        cmp         esi,        edx
+        jae         horizontal_line_5_3_last
+
+        ; Each pass stores four bytes; the fourth is always zero and is
+        ; overwritten by the next group. In the byte layouts, xx means zero.
+        horizontal_line_5_3_loop:
+
+        movq        mm0,        QWORD PTR  [esi]    ; 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ; 00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   ; xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   ; 01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   ; 00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  ; xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        add         esi,        5
+
+        add         edi,        3
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        cmp         esi,        edx                 ; flags survive the MMX ops below
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-3], mm1
+        jb          horizontal_line_5_3_loop        ; addresses compare as unsigned
+
+        horizontal_line_5_3_last:
+
+        ; Final group: same arithmetic, but exactly three bytes are stored so
+        ; nothing is written past the end of the destination line.
+        movq        mm0,        QWORD PTR  [esi]    ; 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ; 00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   ; xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   ; 01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   ; 00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  ; xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        packuswb    mm1,        mm7
+        movd        eax,        mm1
+
+        mov         edx,        eax
+        shr         edx,        16                  ; dl = third output byte
+
+        mov         WORD PTR[edi],   ax
+        mov         BYTE PTR[edi+2], dl
+
+    }
+
+}
+
+
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /*
+     * Shrinks a band vertically: five source rows produce three destination
+     * rows, four columns per loop pass. With a..e the five source rows of
+     * one column:
+     *     row 0 = a
+     *     row 1 = (b* 85 + c*171 + 128) >> 8
+     *     row 2 = (d*171 + e* 85 + 128) >> 8
+     */
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+    __declspec(align(16)) const unsigned short one_thirds[] = {  85,  85,  85,  85 };
+    __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 };
+
+    __asm
+    {
+        push        ebx
+
+        mov         esi,    source                    // esi = source row 0, current column
+        mov         ecx,    src_pitch               // ecx = source pitch
+
+        mov         edi,    dest                    // edi = dest row 0, current column
+        pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
+
+        mov         edx,    dest_pitch               // edx = destination pitch
+        movq        mm5,    one_thirds              // mm5 = 85 per word
+
+        movq        mm6,    two_thirds              // mm6 = 171 per word
+        mov         ebx,    dest_width;             // ebx = columns still to process
+
+        vs_5_3_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // a: source row 0, copied unchanged
+        movd        mm1,    DWORD ptr [esi+ecx]     // b: source row 1
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]   // c: source row 2
+        lea         eax,    [esi+ecx*2]             // eax = source row 2
+
+        punpcklbw   mm1,    mm7
+        punpcklbw   mm2,    mm7
+
+        pmullw      mm1,    mm5                     // b * 85
+        pmullw      mm2,    mm6                     // c * 171
+
+        movd        mm3,    DWORD ptr [eax+ecx]     // d: source row 3
+        movd        mm4,    DWORD ptr [eax+ecx*2]   // e: source row 4
+
+        punpcklbw   mm3,    mm7
+        punpcklbw   mm4,    mm7
+
+        pmullw      mm3,    mm6                     // d * 171
+        pmullw      mm4,    mm5                     // e * 85
+
+
+        movd        DWORD PTR [edi], mm0
+        paddw       mm1,    mm2
+
+        paddw       mm1,    round_values
+        psrlw       mm1,    8                       // dest row 1
+
+        packuswb    mm1,    mm7
+        paddw       mm3,    mm4
+
+        paddw       mm3,    round_values
+        movd        DWORD PTR [edi+edx], mm1
+
+        psrlw       mm3,    8                       // dest row 2
+        packuswb    mm3,    mm7
+
+        movd        DWORD PTR [edi+edx*2], mm3
+
+
+        add         edi,    4
+        add         esi,    4
+
+        sub         ebx,    4
+        jg          vs_5_3_loop                     // loop while columns remain
+
+        pop         ebx
+    }
+}
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_2_1_scale
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+     * Halves a line horizontally by keeping every even-indexed source pixel:
+     * dest[i] = source[2*i]. The loop is driven by dest_width and handles
+     * four output pixels (eight source bytes) per pass; source_width is not
+     * used.
+     */
+    (void) source_width;
+
+    __asm
+    {
+        mov         esi,    source
+        mov         edi,    dest
+
+        pxor        mm7,    mm7                     // mm7 = 0, upper half of the pack
+        mov         ecx,    dest_width              // ecx = output pixels to produce
+
+        xor         edx,    edx                     // edx = output pixels produced so far
+        hs_2_1_loop:
+
+        movq        mm0,    [esi+edx*2]             // eight source bytes
+        psllw       mm0,    8                       // drop the odd byte of each word
+
+        psrlw       mm0,    8                       // even bytes, zero-extended to words
+        packuswb    mm0,    mm7                     // four even bytes in the low dword
+
+        movd        DWORD Ptr [edi+edx], mm0;
+        add         edx,    4
+
+        cmp         edx,    ecx
+        jb          hs_2_1_loop                     // counts are unsigned
+
+    }
+}
+
+
+
+static
+void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /* Non-interpolating 2:1 vertical scale: the output row is simply a copy
+       of the row at source; neither pitch is needed. */
+    (void) src_pitch;
+    (void) dest_pitch;
+
+    vpx_memcpy(dest, source, dest_width);
+}
+
+
+
+static
+void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /*
+     * Interpolating 2:1 vertical scale. Each output pixel is a three-tap
+     * blend of the rows above, at and below source:
+     *     (above*48 + current*160 + below*48 + 128) >> 8
+     * Four columns are produced per loop pass. dest_pitch is not referenced.
+     * The row at source - src_pitch is read, so it must be addressable.
+     */
+
+    __declspec(align(16)) const unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
+    __declspec(align(16)) const unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
+    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
+    __asm
+    {
+        mov         esi,        source
+        mov         edi,        dest
+
+        mov         eax,        src_pitch
+        mov         edx,        dest_width
+
+        pxor        mm7,        mm7
+        sub         esi,        eax             // esi = row above source
+
+
+        lea         ecx,        [esi+edx];      // ecx = end of the row above (loop bound)
+        movq        mm6,        round_values;   // loaded, but the loop adds round_values from memory
+
+        movq        mm5,        three_sixteenths;
+        movq        mm4,        ten_sixteenths;
+
+        vs_2_1_i_loop:
+        movd        mm0,        [esi]           // row above
+        movd        mm1,        [esi+eax]       // current row
+
+        movd        mm2,        [esi+eax*2]     // row below
+        punpcklbw   mm0,        mm7
+
+        pmullw      mm0,        mm5             // above * 48
+        punpcklbw   mm1,        mm7
+
+        pmullw      mm1,        mm4             // current * 160
+        punpcklbw   mm2,        mm7
+
+        pmullw      mm2,        mm5             // below * 48
+        paddw       mm0,        round_values
+
+        paddw       mm1,        mm2
+        paddw       mm0,        mm1
+
+        psrlw       mm0,        8
+        packuswb    mm0,        mm7
+
+        movd        DWORD PTR [edi],        mm0
+        add         esi,        4
+
+        add         edi,        4;
+        cmp         esi,        ecx
+        // NOTE(review): jl is a signed compare applied to addresses; jb looks
+        // like the intended condition -- confirm before changing.
+        jl          vs_2_1_i_loop
+
+    }
+}
+
+void
+register_mmxscalers(void)
+{
+    /* Horizontal line scalers with an MMX implementation. */
+    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
+    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
+    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
+    vp8_horizontal_line_2_1_scale        = horizontal_line_2_1_scale_mmx;
+    vp8_horizontal_line_5_3_scale        = horizontal_line_5_3_scale_mmx;
+    vp8_horizontal_line_5_4_scale        = horizontal_line_5_4_scale_mmx;
+
+    /* Vertical band scalers with an MMX implementation. */
+    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
+    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
+    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
+    vp8_vertical_band_5_4_scale          = vertical_band_5_4_scale_mmx;
+    vp8_vertical_band_5_3_scale          = vertical_band_5_3_scale_mmx;
+    vp8_vertical_band_2_1_scale          = vertical_band_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale_i        = vertical_band_2_1_scale_i_mmx;
+
+    /* Last-band variants with an MMX implementation. */
+    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
+    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
+    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
+
+    /* The 3:4 and 2:3 ratios are assigned the C routines. */
+    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
+    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
+    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
+}
diff --git a/vpx_scale/intel_linux/scalesystemdependant.c b/vpx_scale/intel_linux/scalesystemdependant.c
new file mode 100644
index 000000000..9ed48bfc6
--- /dev/null
+++ b/vpx_scale/intel_linux/scalesystemdependant.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     system_dependant.c
+*
+*   Description  :     Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+#include "cpuidlib.h"
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+extern void register_generic_scalers(void);
+extern void register_mmxscalers(void);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Checks for machine specific features such as MMX support
+ *                  sets appropriate flags and function pointers.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void
+vp8_scale_machine_specific_config(void)
+{
+    // Capability flags filled in by vpx_get_processor_flags().
+    int mmx_enabled;
+    int xmm_enabled;
+    int wmt_enabled;
+
+    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+    if (!mmx_enabled && !xmm_enabled && !wmt_enabled)
+    {
+        // No SIMD capability reported: install the original 'C' versions.
+        vp8_horizontal_line_1_2_scale        = vp8cx_horizontal_line_1_2_scale_c;
+        vp8_horizontal_line_3_5_scale        = vp8cx_horizontal_line_3_5_scale_c;
+        vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
+        vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
+        vp8_horizontal_line_4_5_scale        = vp8cx_horizontal_line_4_5_scale_c;
+        vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
+        vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
+        vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
+
+        vp8_vertical_band_1_2_scale          = vp8cx_vertical_band_1_2_scale_c;
+        vp8_vertical_band_3_5_scale          = vp8cx_vertical_band_3_5_scale_c;
+        vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
+        vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
+        vp8_vertical_band_4_5_scale          = vp8cx_vertical_band_4_5_scale_c;
+        vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
+        vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
+        vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
+        vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
+
+        vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
+        vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
+        vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
+        vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
+        vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
+
+        return;
+    }
+
+    // At least one of the MMX / XMM / WMT flags is set: use the MMX versions.
+    register_mmxscalers();
+}
diff --git a/vpx_scale/leapster/doptsystemdependant_lf.c b/vpx_scale/leapster/doptsystemdependant_lf.c
new file mode 100644
index 000000000..ca1316730
--- /dev/null
+++ b/vpx_scale/leapster/doptsystemdependant_lf.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     system_dependant.c
+*
+*   Description  :     Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+extern int register_generic_scalers(void);
+extern int de_register_generic_scalers(void);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int
+ *
+ *  FUNCTION      : Checks for machine specific features such as MMX support
+ *                  sets appropriate flags and function pointers.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+int
+vp8_scale_machine_specific_config()
+{
+    /* Only the generic scalers are registered here; the value returned by
+       register_generic_scalers() is passed back unchanged. */
+    return register_generic_scalers();
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale_machine_specific_de_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int
+ *
+ *  FUNCTION      : Resets the function pointers and deallocates memory.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+int
+scale_machine_specific_de_config()
+{
+    /* Counterpart of the config routine: delegates to
+       de_register_generic_scalers() and passes its result back unchanged. */
+    return de_register_generic_scalers();
+}
diff --git a/vpx_scale/leapster/gen_scalers_lf.c b/vpx_scale/leapster/gen_scalers_lf.c
new file mode 100644
index 000000000..1b9c7c745
--- /dev/null
+++ b/vpx_scale/leapster/gen_scalers_lf.c
@@ -0,0 +1,521 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     gen_scalers.c
+ *
+ *   Description  :     Generic image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_4_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void vp8cx_horizontal_line_4_5_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    /*
+     * Every group of four source pixels yields five destination pixels.
+     * All groups except the last interpolate their fifth output against the
+     * first pixel of the following group (src[4]).
+     *
+     * The bound is written as "i + 4 < source_width" rather than
+     * "i < source_width - 4": source_width is unsigned, so the subtraction
+     * wraps to a huge value for widths below 4 and the loop would run far
+     * beyond both buffers.
+     */
+    for (i = 0; i + 4 < source_width; i += 4)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char) a;
+        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+        c = src[2] * 154;
+        a = src[3];
+        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+        b = src[4];
+        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);
+
+        src += 4;
+        des += 5;
+    }
+
+    /* Last group: there is no following pixel, so the fifth output simply
+       repeats the final source pixel. */
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+    c = src[2] * 154;
+    a = src[3];
+    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
+    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
+    des [4] = (unsigned char)(a);
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_4_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
+ *                  height of the band scaled is 4-pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int i;
+    unsigned int a, b, c, d;
+    unsigned char *des = dest;
+
+    for (i = 0; i < dest_width; i++)
+    {
+        a = des [0];
+        b = des [dest_pitch];
+
+        des[dest_pitch] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
+
+        c = des[dest_pitch*2] * 154;
+        d = des[dest_pitch*3];
+
+        des [dest_pitch*2] = (unsigned char)((b * 102 + c + 128) >> 8);
+        des [dest_pitch*3] = (unsigned char)((c + 102 * d + 128) >> 8);
+
+        // First line in next band
+        a = des [dest_pitch * 5];
+        des [dest_pitch * 4] = (unsigned char)((d * 205 + 51 * a + 128) >> 8);
+
+        des ++;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_4_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 4 to 5. The
+ *                  height of the band scaled is 4-pixels.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int col;
+    unsigned char *p = dest;
+
+    /* Walk the band one column at a time, rewriting rows 1..4 in place. */
+    for (col = 0; col < dest_width; ++col, ++p)
+    {
+        unsigned int top    = p[0];
+        unsigned int second = p[dest_pitch];
+        unsigned int third_w;
+        unsigned int fourth;
+
+        p[dest_pitch] = (unsigned char)((top * 51 + 205 * second + 128) >> 8);
+
+        third_w = p[dest_pitch*2] * 154;      /* third row, already weighted */
+        fourth  = p[dest_pitch*3];
+
+        p[dest_pitch*2] = (unsigned char)((second * 102 + third_w + 128) >> 8);
+        p[dest_pitch*3] = (unsigned char)((third_w + 102 * fourth + 128) >> 8);
+
+        /* There is no band below to interpolate against, so row 4 repeats
+           the fourth row. */
+        p[dest_pitch*4] = (unsigned char) fourth;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 3 to 5.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+static
+void vp8cx_horizontal_line_3_5_scale_c
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    unsigned int i;
+    unsigned int a, b, c;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    /*
+     * Every group of three source pixels yields five destination pixels.
+     * All groups except the last interpolate their fifth output against the
+     * first pixel of the following group (src[3]).
+     *
+     * The bound is written as "i + 3 < source_width" rather than
+     * "i < source_width - 3": source_width is unsigned, so the subtraction
+     * wraps to a huge value for widths below 3 and the loop would run far
+     * beyond both buffers.
+     */
+    for (i = 0; i + 3 < source_width; i += 3)
+    {
+        a = src[0];
+        b = src[1];
+        des [0] = (unsigned char)(a);
+        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+
+        c = src[2] ;
+        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+        a = src[3];
+        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
+
+        src += 3;
+        des += 5;
+    }
+
+    /* Last group: there is no following pixel, so the fifth output simply
+       repeats the final source pixel. */
+    a = src[0];
+    b = src[1];
+    des [0] = (unsigned char)(a);
+
+    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
+    c = src[2] ;
+    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
+    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
+
+    des [4] = (unsigned char)(c);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
+ *                  height of the band scaled is 3-pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /* Per column: reads rows 0, 1, 2 and row 5 (first line of the next band)
+     * and rewrites rows 1..4 in place.  Row 0 is left untouched. */
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *const px = dest + col;
+        unsigned int upper, lower;
+
+        upper = px[0];                      /* old row 0 */
+        lower = px[dest_pitch];             /* old row 1 */
+        px[dest_pitch] = (unsigned char)((upper * 102 + lower * 154 + 128) >> 8);
+        upper = lower;                      /* old row 1 */
+        lower = px[dest_pitch * 2];         /* old row 2 */
+        px[dest_pitch * 2] = (unsigned char)((upper * 205 + lower * 51 + 128) >> 8);
+        px[dest_pitch * 3] = (unsigned char)((upper * 51 + lower * 205 + 128) >> 8);
+        upper = lower;                      /* old row 2 */
+        lower = px[dest_pitch * 5];         /* first line of the next band */
+        px[dest_pitch * 4] = (unsigned char)((upper * 154 + lower * 102 + 128) >> 8);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_3_5_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 5. The
+ *                  height of the band scaled is 3-pixels.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /* Per column: reads rows 0, 1 and 2 and rewrites rows 1..4 in place.
+     * Row 0 is left untouched. */
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *const px = dest + col;
+        unsigned int upper, lower;
+
+        upper = px[0];                      /* old row 0 */
+        lower = px[dest_pitch];             /* old row 1 */
+        px[dest_pitch] = (unsigned char)((upper * 102 + lower * 154 + 128) >> 8);
+        upper = lower;                      /* old row 1 */
+        lower = px[dest_pitch * 2];         /* old row 2 */
+        px[dest_pitch * 2] = (unsigned char)((upper * 205 + lower * 51 + 128) >> 8);
+        px[dest_pitch * 3] = (unsigned char)((upper * 51 + lower * 205 + 128) >> 8);
+
+        /* No band below to interpolate with, so row 4 repeats the old row 2. */
+        px[dest_pitch * 4] = (unsigned char)(lower);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 1 to 2.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width,
+                                       unsigned char *dest, unsigned int dest_width)
+{
+    /* Doubles a line: each source pixel goes to an even output slot and the
+     * odd slot after it gets the rounded average with its right neighbour. */
+    const unsigned char *src = source;
+    unsigned char *des = dest;
+    unsigned int a, i;
+
+    (void) dest_width;
+
+    /* Empty line: nothing to write, and "source_width - 1" must not wrap. */
+    if (source_width == 0)
+        return;
+
+    for (i = 0; i + 1 < source_width; ++i)
+    {
+        a = src[0];
+        des[0] = (unsigned char)(a);
+        des[1] = (unsigned char)((a + src[1] + 1) >> 1);
+        src += 1;
+        des += 2;
+    }
+
+    /* The last pixel has no right neighbour, so it is simply duplicated. */
+    a = src[0];
+    des[0] = (unsigned char)(a);
+    des[1] = (unsigned char)(a);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+ *                  height of the band scaled is 1-pixel.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /* Per column: row 1 becomes the rounded average of row 0 and row 2,
+     * where row 2 is the first line of the band below.  Rows 0 and 2 are
+     * only read. */
+    unsigned int col;
+
+    for (col = 0; col < dest_width; ++col)
+    {
+        unsigned char *const px = dest + col;
+        const unsigned int above = px[0];
+        const unsigned int below = px[dest_pitch * 2];
+
+        px[dest_pitch] = (unsigned char)((above + below + 1) >> 1);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8cx_last_vertical_band_1_2_scale_c
+ *
+ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+ *                  unsigned int dest_pitch : Stride of destination data.
+ *                  unsigned int dest_width : Width of destination data.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales last vertical band of pixels by scale 1 to 2. The
+ *                  height of the band scaled is 1-pixel.
+ *
+ *  SPECIAL NOTES : The routine does not have available the first line of
+ *                  the band below the current band, since this is the
+ *                  last band.
+ *
+ ****************************************************************************/
+static
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    unsigned int remaining = dest_width;
+    unsigned char *px = dest;
+
+    /* Last band has no line below to average with: row 1 repeats row 0. */
+    for (; remaining != 0; --remaining, ++px)
+    {
+        px[dest_pitch] = px[0];
+    }
+}
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs = 0;  /* scaler function table; 0 until register_generic_scalers() allocates it */
+
+int
+register_generic_scalers(void)
+{
+    /*
+     * Makes sure the global scaler table exists and points every entry at
+     * its portable C implementation.
+     *
+     * Returns 0 on success, -1 if the table could not be allocated.
+     */
+    struct vpxglobal_scalling_ptrs_t *table = g_scaling_ptrs;
+
+    /* Re-use the table from an earlier call.  Allocating unconditionally
+     * would overwrite the only pointer to it and leak the old block. */
+    if (!table)
+        table = (struct vpxglobal_scalling_ptrs_t *)vpx_malloc(sizeof(struct vpxglobal_scalling_ptrs_t));
+
+    if (!table)
+        return -1;
+
+    /* 1:2 kernels */
+    table->vpxhorizontal_line_1_2_scale_t    = vp8cx_horizontal_line_1_2_scale_c;
+    table->vpxvertical_band_1_2_scale_t      = vp8cx_vertical_band_1_2_scale_c;
+    table->vpxlast_vertical_band_1_2_scale_t = vp8cx_last_vertical_band_1_2_scale_c;
+
+    /* 3:5 kernels */
+    table->vpxhorizontal_line_3_5_scale_t    = vp8cx_horizontal_line_3_5_scale_c;
+    table->vpxvertical_band_3_5_scale_t      = vp8cx_vertical_band_3_5_scale_c;
+    table->vpxlast_vertical_band_3_5_scale_t = vp8cx_last_vertical_band_3_5_scale_c;
+
+    /* 4:5 kernels */
+    table->vpxhorizontal_line_4_5_scale_t    = vp8cx_horizontal_line_4_5_scale_c;
+    table->vpxvertical_band_4_5_scale_t      = vp8cx_vertical_band_4_5_scale_c;
+    table->vpxlast_vertical_band_4_5_scale_t = vp8cx_last_vertical_band_4_5_scale_c;
+
+    g_scaling_ptrs = table;
+
+    return 0;
+}
+
+int
+de_register_generic_scalers(void)
+{
+    /*
+     * Releases the scaler table held in g_scaling_ptrs.
+     * Returns 0 when a table was freed, -1 when none was registered.
+     */
+    if (!g_scaling_ptrs)
+        return -1;
+
+    vpx_free(g_scaling_ptrs);
+
+    /* Clear the global so a later call reports "nothing registered". */
+    g_scaling_ptrs = 0;
+
+    return 0;
+}
diff --git a/vpx_scale/leapster/vpxscale_lf.c b/vpx_scale/leapster/vpxscale_lf.c
new file mode 100644
index 000000000..5f05e5de0
--- /dev/null
+++ b/vpx_scale/leapster/vpxscale_lf.c
@@ -0,0 +1,890 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     scale.c
+ *
+ *   Description  :     Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "stdlib.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+#include "codec_common_interface.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+/*
+void  (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void  (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void  (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void  (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void  (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+void  (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+void  (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
+void  (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+void  (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
+*/
+
+
+typedef struct   /* scaling parameters handed to the any_ratio_* routines in this file */
+{
+    int     expanded_frame_width;    /* presumably the scaled output width - TODO confirm against callers */
+    int     expanded_frame_height;   /* presumably the scaled output height - TODO confirm against callers */
+    /* any_ratio_2d_scale() maps Ratio * 10 / Scale == 8, 6, 5, 10 to the 4:5, 3:5, 1:2 and 1:1 kernels */
+    int HScale;   /* horizontal: divisor of that expression (must be non-zero) */
+    int HRatio;   /* horizontal: multiplied by 10, then divided by HScale */
+    int VScale;   /* vertical: divisor of that expression (must be non-zero) */
+    int VRatio;   /* vertical: multiplied by 10, then divided by VScale */
+
+    YV12_BUFFER_CONFIG *src_yuv_config;   /* presumably the frame to scale from - TODO confirm */
+    YV12_BUFFER_CONFIG *dst_yuv_config;   /* presumably the frame to scale into - TODO confirm */
+
+} SCALE_VARS;
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       :     horizontal_line_copy
+ *
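+ *  INPUTS        :     source, source_width : Line to copy and its length in bytes.
+ *                      dest, dest_width     : Destination line (dest_width is NOT USED).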
+ *
+ *  OUTPUTS       :     None.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     1 to 1 "scaling" (plain copy) of a horizontal line of pixels
+ *
+ *  SPECIAL NOTES :     None.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_copy(const unsigned char *source, unsigned int source_width,
+                          unsigned char *dest, unsigned int dest_width)
+{
+    /* 1:1 "scaler": copies source_width bytes from source to dest unchanged.
+     * dest_width is unused; it is voided like in the other line scalers. */
+    (void) dest_width;
+
+    duck_memcpy(dest, source, source_width);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       :     null_scale
+ *
+ *  INPUTS        :     dest, dest_pitch, dest_width : Destination band (all NOT USED).
+ *
+ *
+ *  OUTPUTS       :     None.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     1 to 1 scaling up for a vertical band
+ *
+ *  SPECIAL NOTES :     None.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+static
+void null_scale(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /* Intentionally a no-op: stands in as the vertical band scaler when the
+     * vertical ratio is 1:1, so the band is left exactly as it is. */
+    (void) dest;
+    (void) dest_pitch;
+    (void) dest_width;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_i
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 interpolated scaling.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_i
+(
+    const unsigned char *source,
+    int source_step,
+    unsigned int source_scale,
+    unsigned int source_length,
+    unsigned char *dest,
+    int dest_step,
+    unsigned int dest_scale,
+    unsigned int dest_length
+)
+{
+    unsigned int i, j;
+    unsigned int temp;
+
+    (void) source_length;
+    (void) source_scale;
+    (void) dest_scale;
+
+    /* 2:1 reduction: from here on one step skips every second source sample. */
+    source_step *= 2;
+    /* Output 0 has no left neighbour, so it is copied straight through. */
+    dest[0] = source[0];
+
+    for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step)
+    {
+        /* [3 10 3] / 16 filter (+8 rounds) over the samples one doubled step either side of j. */
+        temp = 3 * source[j - source_step] + 10 * source[j] + 3 * source[j + source_step] + 8;
+        /* At most 255 after the shift; plain char could be signed, so cast to the pixel type. */
+        dest[i] = (unsigned char)(temp >> 4);
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_ps
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 point subsampled scaling.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_ps(const unsigned char *source, int source_step,
+                    unsigned int source_scale, unsigned int source_length,
+                    unsigned char *dest, int dest_step,
+                    unsigned int dest_scale, unsigned int dest_length)
+{
+    /* 2:1 point sampling: each output takes every second source sample
+     * unchanged; the skipped samples are never read. */
+    const unsigned int dest_end = dest_length * dest_step;
+    const int stride = source_step * 2;
+    unsigned int out_pos = 0;
+    unsigned int in_pos = 0;
+
+    (void) source_length;
+    (void) source_scale;
+    (void) dest_scale;
+
+    /* Advance both cursors together until dest_length outputs are written. */
+    while (out_pos < dest_end)
+    {
+        dest[out_pos] = source[in_pos];
+        out_pos += dest_step;
+        in_pos += stride;
+    }
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source.
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination.
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs linear interpolation in one dimension.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_c(const unsigned char *source, int source_step,
+               unsigned int source_scale, unsigned int source_length,
+               unsigned char *dest, int dest_step,
+               unsigned int dest_scale, unsigned int dest_length)
+{
+    /*
+     * Walks the destination one sample at a time.  Each output blends the
+     * current source pair, weighted left_modifier : right_modifier (the two
+     * always sum to dest_scale).  The right weight grows by source_scale per
+     * output; whenever it exceeds dest_scale the source pair moves on by one.
+     */
+    const unsigned int round_value = dest_scale / 2;
+    const unsigned int dest_end = dest_length * dest_step;
+    unsigned int left_modifier = dest_scale;
+    unsigned int right_modifier = 0;
+    unsigned char left_pixel = source[0];
+    unsigned char right_pixel = source[source_step];
+    unsigned int i;
+
+    (void) source_length;
+
+    /* NOTE(review): nothing here checks that the pair stays inside the source
+     * buffer or that dest_scale is non-zero; callers must guarantee both. */
+    for (i = 0; i < dest_end; i += dest_step)
+    {
+        const unsigned int blended = left_modifier * left_pixel + right_modifier * right_pixel + round_value;
+        /* The quotient is at most 255.  Cast to the pixel type, not plain
+         * char, whose conversion above CHAR_MAX is implementation-defined. */
+        dest[i] = (unsigned char)(blended / dest_scale);
+
+        right_modifier += source_scale;
+        while (right_modifier > dest_scale)
+        {
+            right_modifier -= dest_scale;
+            source += source_step;
+            left_pixel = source[0];
+            right_pixel = source[source_step];
+        }
+
+        left_modifier = dest_scale - right_modifier;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : Scale2D
+ *
+ *  INPUTS        : const unsigned char *source  : Pointer to data to be scaled.
+ *                  int source_pitch              : Stride of source image.
+ *                  unsigned int source_width     : Width of input image.
+ *                  unsigned int source_height    : Height of input image.
+ *                  unsigned char *dest          : Pointer to output data array.
+ *                  int dest_pitch                : Stride of destination image.
+ *                  unsigned int dest_width       : Width of destination image.
+ *                  unsigned int dest_height      : Height of destination image.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal scale factor numerator.
+ *                  unsigned int hratio          : Horizontal scale factor denominator.
+ *                  unsigned int vscale          : Vertical scale factor numerator.
+ *                  unsigned int vratio          : Vertical scale factor denominator.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+static
+void Scale2D
+(
+    const unsigned char *source,
+    int source_pitch,
+    unsigned int source_width,
+    unsigned int source_height,
+    unsigned char *dest,
+    int dest_pitch,
+    unsigned int dest_width,
+    unsigned int dest_height,
+    unsigned char *temp_area,
+    unsigned char temp_area_height,
+    unsigned int hscale,
+    unsigned int hratio,
+    unsigned int vscale,
+    unsigned int vratio,
+    unsigned int interlaced
+)
+{
+    unsigned int i, j, k;
+    unsigned int bands;
+    unsigned int dest_band_height;
+    unsigned int source_band_height;
+    // common signature of the 1-D kernels, so each direction can pick its own implementation
+    typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
+                            unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
+
+    Scale1D Scale1Dv = scale1d_c;   // vertical pass, default: general linear interpolation
+    Scale1D Scale1Dh = scale1d_c;   // horizontal pass, same default
+    // an exact 2:1 reduction uses the dedicated kernels instead
+    if (hscale == 2 && hratio == 1)
+        Scale1Dh = scale1d_2t1_ps;
+
+    if (vscale == 2 && vratio == 1)
+    {
+        if (interlaced)
+            Scale1Dv = scale1d_2t1_ps;   // interlaced: point sampling
+        else
+            Scale1Dv = scale1d_2t1_i;    // otherwise: the interpolating 2:1 kernel
+    }
+
+    if (source_height == dest_height)
+    {
+        // same height: horizontal scaling only, row by row straight into dest (temp_area unused)
+        for (k = 0; k < dest_height; k++)
+        {
+            Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
+            source += source_pitch;
+            dest   += dest_pitch;
+        }
+
+        return;
+    }
+    // band sizes: the taller side gets temp_area_height - 1 rows, the other side is derived from it
+    if (dest_height > source_height)
+    {
+        dest_band_height   = temp_area_height - 1;
+        source_band_height = dest_band_height * source_height / dest_height;
+    }
+    else
+    {
+        source_band_height = temp_area_height - 1;
+        dest_band_height   = source_band_height * vratio / vscale;
+    }
+
+    // fill temp_area row 0 up front: the vertical pass is handed source_band_height + 1 rows per band
+    Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
+
+    // number of bands, rounded up. NOTE(review): dest_band_height == 0 would divide by zero - confirm callers
+    bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+    for (k = 0; k < bands; k++)
+    {
+        // horizontal pass: fill temp_area rows 1..source_band_height (temp rows are dest_pitch bytes apart)
+        for (i = 1; i < source_band_height + 1; i++)
+        {
+            if (k * source_band_height + i < source_height)
+            {
+                Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+                         temp_area + i * dest_pitch, 1, hratio, dest_width);
+            }
+            else  //  past the bottom of the source
+            {
+                // no source row left: repeat the temp_area row just above this one
+                duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+            }
+        }
+
+        // vertical pass: run the 1-D kernel down each column, from temp_area into dest
+        for (j = 0; j < dest_width; j++)
+        {
+            Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+                     &dest[j], dest_pitch, vratio, dest_band_height);
+        }
+
+        // carry the band's last temp_area row over as row 0 of the next band
+        duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+        // move to the next band
+        source += source_band_height * source_pitch;
+        dest   += dest_band_height * dest_pitch;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src       : Pointer to frame to be scaled.
+ *                  YV12_BUFFER_CONFIG *dst       : Pointer to buffer to hold scaled frame.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal scale factor numerator.
+ *                  unsigned int hratio          : Horizontal scale factor denominator.
+ *                  unsigned int vscale          : Vertical scale factor numerator.
+ *                  unsigned int vratio          : Vertical scale factor denominator.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+void vp8_scale_frame
+(
+    YV12_BUFFER_CONFIG *src,
+    YV12_BUFFER_CONFIG *dst,
+    unsigned char *temp_area,
+    unsigned char temp_height,
+    unsigned int hscale,
+    unsigned int hratio,
+    unsigned int vscale,
+    unsigned int vratio,
+    unsigned int interlaced
+)
+{
+    int i;
+    int dw = (hscale - 1 + src->y_width * hratio) / hscale;    // scaled luma width: y_width * hratio / hscale, rounded up
+    int dh = (vscale - 1 + src->y_height * vratio) / vscale;   // scaled luma height: y_height * vratio / vscale, rounded up
+
+    // Y plane: scale into a dw x dh area starting at dst->y_buffer
+    Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
+            (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+    // right padding: columns dw-1 .. y_width-1 of each of the dh scaled rows repeat the pixel in column dw-2
+    if (dw < (int)dst->y_width)
+        for (i = 0; i < dh; i++)
+            duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1);
+    // bottom padding: rows dh-1 .. y_height-1 become copies of row dh-2 (y_width + 1 bytes per row)
+    if (dh < (int)dst->y_height)
+        for (i = dh - 1; i < (int)dst->y_height; i++)
+            duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+    // U plane: the same three steps with a dw / 2 x dh / 2 destination area
+    Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
+            (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+    // NOTE(review): pads all uv_height rows here, whereas the luma pass pads only the dh scaled rows - confirm
+    if (dw / 2 < (int)dst->uv_width)
+        for (i = 0; i < dst->uv_height; i++)
+            duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+    // NOTE(review): loop bound is y_height / 2 and the copy is uv_width bytes (no + 1), unlike luma - confirm
+    if (dh / 2 < (int)dst->uv_height)
+        for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+            duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+    // V plane: identical to the U plane, on v_buffer
+    Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
+            (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
+            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+    if (dw / 2 < (int)dst->uv_width)
+        for (i = 0; i < dst->uv_height; i++)
+            duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
+
+    if (dh / 2 < (int) dst->uv_height)
+        for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+            duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : any_ratio_2d_scale
+ *
+ *  INPUTS        : SCALE_VARS *si          : Scale factors (HScale, HRatio, VScale, VRatio) that select the kernels.
+ *                  const unsigned char *source : Pointer to source image.
+ *                  unsigned int source_pitch    : Stride of source image.
+ *                  unsigned int source_width    : Width of source image.
+ *                  unsigned int source_height   : Height of source image (NOT USED).
+ *                  unsigned char *dest         : Pointer to destination image.
+ *                  unsigned int dest_pitch      : Stride of destination image.
+ *                  unsigned int dest_width      : Width of destination image.
+ *                  unsigned int dest_height     : Height of destination image.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
+ *
+ *  FUNCTION      : Scale the image with changing aspect ratio.
+ *
+ *  SPECIAL NOTES : This scaling is a bi-linear scaling. Need to re-work the
+ *                  whole function for new scaling algorithm.
+ *
+ ****************************************************************************/
+static
+int any_ratio_2d_scale(SCALE_VARS *si,
+                       const unsigned char *source, unsigned int source_pitch,
+                       unsigned int source_width, unsigned int source_height,
+                       unsigned char *dest, unsigned int dest_pitch,
+                       unsigned int dest_width, unsigned int dest_height)
+{
+    /*
+     * Scales one plane using the fixed-ratio kernels: si selects 4:5, 3:5,
+     * 1:2 or 1:1 independently per direction.  Rows are scaled horizontally
+     * into dest, then each band of dest is scaled vertically in place.
+     * Returns 1 if the plane was scaled, 0 if either ratio is unsupported.
+     */
+    unsigned int i, k;
+    unsigned int src_band_height  = 0;
+    unsigned int dest_band_height = 0;
+
+    // suggested scale factors
+    int hs = si->HScale;
+    int hr = si->HRatio;
+    int vs = si->VScale;
+    int vr = si->VRatio;
+
+    int ratio_scalable = 1;   // cleared when a direction has no matching kernel
+
+    void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+    void (*vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
+    void (*last_vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
+
+    (void) source_height;
+
+    // a zero scale factor would divide by zero below: report it as "not scalable"
+    if (hs == 0 || vs == 0)
+        return 0;
+
+    // pick the kernel for each direction (g_scaling_ptrs must already be registered)
+    switch (hr * 10 / hs)
+    {
+    case 8:
+        // 4-5 Scale in Width direction
+        horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t;
+        break;
+    case 6:
+        // 3-5 Scale in Width direction
+        horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t;
+        break;
+    case 5:
+        // 1-2 Scale in Width direction
+        horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t;
+        break;
+    case 10:
+        // no scale in Width direction
+        horiz_line_scale = horizontal_line_copy;
+        break;
+    default:
+        // The ratio is not acceptable now
+        ratio_scalable = 0;
+        break;
+    }
+
+    switch (vr * 10 / vs)
+    {
+    case 8:
+        // 4-5 Scale in vertical direction
+        vert_band_scale     = g_scaling_ptrs->vpxvertical_band_4_5_scale_t;
+        last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t;
+        src_band_height     = 4;
+        dest_band_height    = 5;
+        break;
+    case 6:
+        // 3-5 Scale in vertical direction
+        vert_band_scale     = g_scaling_ptrs->vpxvertical_band_3_5_scale_t;
+        last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t;
+        src_band_height     = 3;
+        dest_band_height    = 5;
+        break;
+    case 5:
+        // 1-2 Scale in vertical direction
+        vert_band_scale     = g_scaling_ptrs->vpxvertical_band_1_2_scale_t;
+        last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t;
+        src_band_height     = 1;
+        dest_band_height    = 2;
+        break;
+    case 10:
+        // no scale in vertical direction: 4-row bands with the no-op band scaler
+        vert_band_scale     = null_scale;
+        last_vert_band_scale = null_scale;
+        src_band_height     = 4;
+        dest_band_height    = 4;
+        break;
+    default:
+        // The ratio is not acceptable now
+        ratio_scalable = 0;
+        break;
+    }
+
+    if (ratio_scalable == 0)
+        return ratio_scalable;
+
+    horiz_line_scale(source, source_width, dest, dest_width);   // first line of the first band
+
+    // except last band
+    for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++)
+    {
+        // scale one band horizontally
+        for (i = 1; i < src_band_height; i++)
+        {
+            horiz_line_scale(source + i * source_pitch,
+                             source_width,
+                             dest + i * dest_pitch,
+                             dest_width);
+        }
+
+        // first line of next band
+        horiz_line_scale(source + src_band_height * source_pitch,
+                         source_width,
+                         dest + dest_band_height * dest_pitch,
+                         dest_width);
+
+        // Vertical scaling is in place
+        vert_band_scale(dest, dest_pitch, dest_width);
+
+        // Next band...
+        source += src_band_height  * source_pitch;
+        dest   += dest_band_height * dest_pitch;
+    }
+
+    // last band: its first line is already in place, scale the remaining lines
+    for (i = 1; i < src_band_height; i++)
+    {
+        horiz_line_scale(source + i * source_pitch,
+                         source_width,
+                         dest + i * dest_pitch,
+                         dest_width);
+    }
+
+    // Vertical scaling is in place; there is no following band to read from
+    last_vert_band_scale(dest, dest_pitch, dest_width);
+
+    return ratio_scalable;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : any_ratio_frame_scale
+ *
+ *  INPUTS        : SCALE_VARS *scale_vars : Scale factors (HScale, HRatio,
+ *                                           VScale, VRatio), the expanded
+ *                                           frame size, and the source and
+ *                                           destination YV12 buffers.
+ *                  int YOffset            : Offset added to the destination
+ *                                           Y buffer pointer.
+ *                  int UVOffset           : Offset added to the destination
+ *                                           U and V buffer pointers.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: value returned by any_ratio_2d_scale for the Y
+ *                  plane; 0 means the image could not be scaled.
+ *
+ *  FUNCTION      : Scale the image with changing aspect ratio.
+ *
+ *  SPECIAL NOTES : The destination Y area to the right of and below the
+ *                  scaled image is zero-filled before the Y scale result
+ *                  is checked, so it is cleared even when 0 is returned.
+ *                  The results of the U and V scaling calls are not checked.
+ *
+ ****************************************************************************/
+static
+int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset)
+{
+    int i;
+    int ew;
+    int eh;
+
+    // suggested scale factors
+    int hs = scale_vars->HScale;
+    int hr = scale_vars->HRatio;
+    int vs = scale_vars->VScale;
+    int vr = scale_vars->VRatio;
+
+    int ratio_scalable = 1;
+
+    // source size needed to produce the expanded frame: ceil(dst * ratio / scale)
+    int sw = (scale_vars->expanded_frame_width * hr + hs - 1) / hs;
+    int sh = (scale_vars->expanded_frame_height * vr + vs - 1) / vs;
+    int dw = scale_vars->expanded_frame_width;
+    int dh = scale_vars->expanded_frame_height;
+    YV12_BUFFER_CONFIG *src_yuv_config = scale_vars->src_yuv_config;
+    YV12_BUFFER_CONFIG *dst_yuv_config = scale_vars->dst_yuv_config;
+
+    // extended destination width: source width rounded up to a multiple of
+    // 3 (when hr == 3) or 8, mapped through hs / hr
+    if (hr == 3)
+        ew = (sw + 2) / 3 * 3 * hs / hr;
+    else
+        ew = (sw + 7) / 8 * 8 * hs / hr;
+
+    // extended destination height, computed the same way from vr / vs
+    if (vr == 3)
+        eh = (sh + 2) / 3 * 3 * vs / vr;
+    else
+        eh = (sh + 7) / 8 * 8 * vs / vr;
+
+    ratio_scalable = any_ratio_2d_scale(scale_vars,
+                                        (const unsigned char *)src_yuv_config->y_buffer,
+                                        src_yuv_config->y_stride, sw, sh,
+                                        (unsigned char *) dst_yuv_config->y_buffer + YOffset,
+                                        dst_yuv_config->y_stride, dw, dh);
+
+    // clear columns dw .. ew-1 of rows 0 .. eh-1
+    for (i = 0; i < eh; i++)
+        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride + dw, 0, ew - dw);
+
+    // clear columns 0 .. ew-1 of rows dh .. eh-1
+    for (i = dh; i < eh; i++)
+        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride, 0, ew);
+
+    if (ratio_scalable == 0)
+        return ratio_scalable;
+
+    // chroma planes are scaled at half size, rounded up
+    sw = (sw + 1) >> 1;
+    sh = (sh + 1) >> 1;
+    dw = (dw + 1) >> 1;
+    dh = (dh + 1) >> 1;
+
+    // The source pitch of a chroma plane is the source's own uv_stride; it
+    // is only equal to y_stride / 2 when the buffer happens to be laid out
+    // that way.
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src_yuv_config->u_buffer,
+                       src_yuv_config->uv_stride, sw, sh,
+                       (unsigned char *)dst_yuv_config->u_buffer + UVOffset,
+                       dst_yuv_config->uv_stride, dw, dh);
+
+    any_ratio_2d_scale(scale_vars,
+                       (const unsigned char *)src_yuv_config->v_buffer,
+                       src_yuv_config->uv_stride, sw, sh,
+                       (unsigned char *)dst_yuv_config->v_buffer + UVOffset,
+                       dst_yuv_config->uv_stride, dw, dh);
+
+    return ratio_scalable;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : center_image_copy_plane
+ *
+ *  INPUTS        : char *dst_row   : First destination row to write.
+ *                  int dst_stride  : Distance between destination rows.
+ *                  char *src_row   : First source row to read.
+ *                  int src_stride  : Distance between source rows.
+ *                  int width       : Number of bytes copied per row.
+ *                  int rows        : Number of rows copied.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies a rectangle of bytes row by row.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void
+center_image_copy_plane(char *dst_row, int dst_stride,
+                        char *src_row, int src_stride,
+                        int width, int rows)
+{
+    while (rows-- > 0)
+    {
+        duck_memcpy(dst_row, src_row, width);
+        dst_row += dst_stride;
+        src_row += src_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : center_image
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_yuv_config : Image to copy.
+ *                  YV12_BUFFER_CONFIG *dst_yuv_config : Buffer that receives
+ *                                                       the centered copy.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Centers the image without scaling in the output buffer.
+ *
+ *  SPECIAL NOTES : The Y offsets are half the difference between the
+ *                  destination and source Y dimensions; the U and V planes
+ *                  use half of those offsets.
+ *
+ ****************************************************************************/
+static void
+center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config)
+{
+    // luma placement inside the destination
+    int top  = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2;
+    int left = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2;
+
+    // Y plane
+    center_image_copy_plane((char *)dst_yuv_config->y_buffer + (top * dst_yuv_config->y_stride) + left,
+                            dst_yuv_config->y_stride,
+                            (char *)src_yuv_config->y_buffer,
+                            src_yuv_config->y_stride,
+                            src_yuv_config->y_width,
+                            src_yuv_config->y_height);
+
+    // chroma placement is half the luma placement
+    top  /= 2;
+    left /= 2;
+
+    // U plane
+    center_image_copy_plane((char *)dst_yuv_config->u_buffer + (top * dst_yuv_config->uv_stride) + left,
+                            dst_yuv_config->uv_stride,
+                            (char *)src_yuv_config->u_buffer,
+                            src_yuv_config->uv_stride,
+                            src_yuv_config->uv_width,
+                            src_yuv_config->uv_height);
+
+    // V plane
+    center_image_copy_plane((char *)dst_yuv_config->v_buffer + (top * dst_yuv_config->uv_stride) + left,
+                            dst_yuv_config->uv_stride,
+                            (char *)src_yuv_config->v_buffer,
+                            src_yuv_config->uv_stride,
+                            src_yuv_config->uv_width,
+                            src_yuv_config->uv_height);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_scale_or_center
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_yuv_config : Source image.
+ *                  YV12_BUFFER_CONFIG *dst_yuv_config : Destination buffer.
+ *                  int expanded_frame_width           : Width of the scaled
+ *                                                       image.
+ *                  int expanded_frame_height          : Height of the scaled
+ *                                                       image.
+ *                  int scaling_mode                   : SCALE_TO_FIT,
+ *                                                       MAINTAIN_ASPECT_RATIO
+ *                                                       or CENTER.
+ *                  int HScale, int HRatio             : Horizontal scale
+ *                                                       factors.
+ *                  int VScale, int VRatio             : Vertical scale
+ *                                                       factors.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Decides to scale or center image in scale buffer for blit
+ *
+ *  SPECIAL NOTES : Any other scaling_mode value does nothing. The result of
+ *                  the scaling call is not reported to the caller.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_scale_or_center
+(
+    YV12_BUFFER_CONFIG *src_yuv_config,
+    YV12_BUFFER_CONFIG *dst_yuv_config,
+    int expanded_frame_width,
+    int expanded_frame_height,
+    int scaling_mode,
+    int HScale,
+    int HRatio,
+    int VScale,
+    int VRatio
+)
+{
+    if (scaling_mode == SCALE_TO_FIT || scaling_mode == MAINTAIN_ASPECT_RATIO)
+    {
+        SCALE_VARS scale_vars;
+
+        // position of the expanded frame so that it is centered in the
+        // destination, expressed through the destination strides
+        int top       = (dst_yuv_config->y_height - expanded_frame_height) / 2;
+        int left      = (dst_yuv_config->y_width  - expanded_frame_width) / 2;
+        int y_offset  = top * dst_yuv_config->y_stride + left;
+        int uv_offset = (top >> 1) * dst_yuv_config->uv_stride + (left >> 1);
+
+        scale_vars.src_yuv_config        = src_yuv_config;
+        scale_vars.dst_yuv_config        = dst_yuv_config;
+        scale_vars.expanded_frame_width  = expanded_frame_width;
+        scale_vars.expanded_frame_height = expanded_frame_height;
+        scale_vars.HScale                = HScale;
+        scale_vars.HRatio                = HRatio;
+        scale_vars.VScale                = VScale;
+        scale_vars.VRatio                = VRatio;
+
+        // perform center and scale
+        any_ratio_frame_scale(&scale_vars, y_offset, uv_offset);
+    }
+    else if (scaling_mode == CENTER)
+    {
+        center_image(src_yuv_config, dst_yuv_config);
+    }
+}
diff --git a/vpx_scale/leapster/yv12extend.c b/vpx_scale/leapster/yv12extend.c
new file mode 100644
index 000000000..480d971b4
--- /dev/null
+++ b/vpx_scale/leapster/yv12extend.c
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     yv12extend.c
+ *
+ *   Description  :
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+//#include <stdlib.h>
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : extend_plane_borders
+ *
+ *  INPUTS        : char *plane          : First visible pixel of the plane.
+ *                  int plane_stride     : Distance between rows of the plane.
+ *                  int plane_width      : Visible width of the plane.
+ *                  int plane_height     : Visible height of the plane.
+ *                  unsigned int Border  : Border size on each side.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Replicates the edge pixels of one plane into its border.
+ *
+ *  SPECIAL NOTES : The top and bottom border rows are copied as
+ *                  plane_stride bytes starting Border bytes to the left of
+ *                  the plane, so the stride is expected to span the left
+ *                  border, the visible width and the right border.
+ *
+ ****************************************************************************/
+static void
+extend_plane_borders(char *plane, int plane_stride, int plane_width,
+                     int plane_height, unsigned int Border)
+{
+    int i;
+    char *src_ptr1, *src_ptr2;
+    char *dest_ptr1, *dest_ptr2;
+
+    // copy the left and right most columns out
+    src_ptr1 = plane;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders
+    src_ptr1 = plane - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_extend_frame_borders
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *ybf : Frame whose borders are filled.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Extends the Y, U and V planes of the frame into their
+ *                  borders by replicating the edge pixels.
+ *
+ *  SPECIAL NOTES : The Y plane uses ybf->border; the U and V planes use
+ *                  half of it together with the frame's own uv_stride,
+ *                  uv_height and uv_width.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int Border = ybf->border;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    extend_plane_borders((char *)ybf->y_buffer, ybf->y_stride,
+                         ybf->y_width, ybf->y_height, Border);
+
+    // The chroma geometry comes from the uv_* fields rather than from
+    // halving the luma values, which is only equivalent for buffers whose
+    // chroma planes are laid out at exactly half the luma stride and size.
+    Border /= 2;
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_plane_borders((char *)ybf->u_buffer, ybf->uv_stride,
+                         ybf->uv_width, ybf->uv_height, Border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_plane_borders((char *)ybf->v_buffer, ybf->uv_stride,
+                         ybf->uv_width, ybf->uv_height, Border);
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : copy_plane_rows
+ *
+ *  INPUTS        : char *src_row   : First source row to read.
+ *                  int src_stride  : Distance between source rows.
+ *                  char *dst_row   : First destination row to write.
+ *                  int dst_stride  : Distance between destination rows.
+ *                  int row_bytes   : Number of bytes copied per row.
+ *                  int rows        : Number of rows copied.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies rows of bytes, one 32-bit word at a time, writing
+ *                  the destination through an _Uncached pointer.
+ *
+ *  SPECIAL NOTES : Each pointer advances by its own stride, so the two
+ *                  buffers do not have to be packed. Bytes left over when
+ *                  row_bytes is not a multiple of 4 are copied singly.
+ *                  NOTE(review): the word accesses presumably require both
+ *                  row pointers to be 4-byte aligned - confirm for target.
+ *
+ ****************************************************************************/
+static void
+copy_plane_rows(char *src_row, int src_stride,
+                char *dst_row, int dst_stride,
+                int row_bytes, int rows)
+{
+    int row;
+    int i;
+    int words = row_bytes / 4;
+    unsigned int *source;
+    _Uncached unsigned int *dest;
+
+    for (row = 0; row < rows; row++)
+    {
+        source = (unsigned int *)src_row;
+        dest = (_Uncached unsigned int *)dst_row;
+
+        for (i = 0; i < words; i++)
+        {
+            dest[i] = source[i];
+        }
+
+        // bytes that do not fill a whole word
+        for (i = words * 4; i < row_bytes; i++)
+        {
+            ((_Uncached char *)dst_row)[i] = src_row[i];
+        }
+
+        src_row += src_stride;
+        dst_row += dst_stride;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_yv12_copy_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+ *                  YV12_BUFFER_CONFIG *dst_ybc : Frame to copy into.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies the source image into the destination image and
+ *                  updates the destination's UMV borders.
+ *
+ *  SPECIAL NOTES : The frames are assumed to be identical in size.
+ *                  The copied area of each plane includes its border: the
+ *                  full border on every side of the Y plane and half of it
+ *                  on every side of the U and V planes. The area size is
+ *                  taken from the source frame.
+ *
+ ****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    int height;
+    int width;
+
+    // Y plane, starting at the top-left corner of its border
+    height = src_ybc->y_height + (src_ybc->border * 2);
+    width =  src_ybc->y_width + (src_ybc->border * 2);
+
+    copy_plane_rows((char *)(src_ybc->y_buffer - (src_ybc->border * src_ybc->y_stride) - src_ybc->border),
+                    src_ybc->y_stride,
+                    (char *)(dst_ybc->y_buffer - (dst_ybc->border * dst_ybc->y_stride) - dst_ybc->border),
+                    dst_ybc->y_stride,
+                    width, height);
+
+    // U and V planes share the same geometry
+    height = src_ybc->uv_height + (src_ybc->border);
+    width =  src_ybc->uv_width + (src_ybc->border);
+
+    copy_plane_rows((char *)(src_ybc->u_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2),
+                    src_ybc->uv_stride,
+                    (char *)(dst_ybc->u_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2),
+                    dst_ybc->uv_stride,
+                    width, height);
+
+    copy_plane_rows((char *)(src_ybc->v_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2),
+                    src_ybc->uv_stride,
+                    (char *)(dst_ybc->v_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2),
+                    dst_ybc->uv_stride,
+                    width, height);
+}
diff --git a/vpx_scale/scale_mode.h b/vpx_scale/scale_mode.h
new file mode 100644
index 000000000..2a9ab7612
--- /dev/null
+++ b/vpx_scale/scale_mode.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*****************************************************************************
+*/
+
+#ifndef SCALE_MODE_H
+#define SCALE_MODE_H
+
+/*
+ * Scaling mode selector.
+ * NOTE(review): presumably the value passed as the scaling_mode argument of
+ * vp8_yv12_scale_or_center - confirm against the callers.
+ */
+typedef enum
+{
+    MAINTAIN_ASPECT_RATIO   = 0x0,
+    SCALE_TO_FIT            = 0x1,
+    CENTER                  = 0x2,
+    OTHER                   = 0x3
+} SCALE_MODE;
+
+
+#endif
diff --git a/vpx_scale/symbian/gen_scalers_armv4.asm b/vpx_scale/symbian/gen_scalers_armv4.asm
new file mode 100644
index 000000000..1c904edae
--- /dev/null
+++ b/vpx_scale/symbian/gen_scalers_armv4.asm
@@ -0,0 +1,773 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |horizontal_line_4_5_scale_armv4|
+    EXPORT  |vertical_band_4_5_scale_armv4|
+    EXPORT  |horizontal_line_2_3_scale_armv4|
+    EXPORT  |vertical_band_2_3_scale_armv4|
+    EXPORT  |horizontal_line_3_5_scale_armv4|
+    EXPORT  |vertical_band_3_5_scale_armv4|
+    EXPORT  |horizontal_line_3_4_scale_armv4|
+    EXPORT  |vertical_band_3_4_scale_armv4|
+    EXPORT  |horizontal_line_1_2_scale_armv4|
+    EXPORT  |vertical_band_1_2_scale_armv4|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+; Register aliases used by the routines below.
+; r0 is the first argument: the source pointer of the horizontal routines
+; and the destination pointer of the vertical (in-place) routines.
+src         RN  r0
+; r1 is the second argument: the pixel count of the horizontal routines.
+srcw        RN  r1
+; r2 is the third argument: the destination pointer of the horizontal routines.
+dest        RN  r2
+; Byte mask (loaded with 255) for picking pixels out of a loaded word.
+mask        RN  r12
+; Packed weight pair, loaded with 0x3300cd = (51 << 16) | 205.
+c51_205     RN  r10
+; Packed weight pair, loaded with 0x66009a = (102 << 16) | 154.
+c102_154    RN  r11
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_4_5_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Pixel count driving the
+; *                                                 main loop (4 per pass).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : NOT USED.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 4 to 5.
+; *
+; *  SPECIAL NOTES : Each group of 4 source pixels a b c d (plus e, the first
+; *                  pixel of the next group) gives 5 output pixels:
+; *                      a, 51a+205b, 102b+154c, 154c+102d, 205d+51e
+; *                  with the weighted sums scaled back down by 256.
+; *                  The main loop runs source_width / 4 times and only ends
+; *                  when the count reaches exactly 0, so source_width must
+; *                  be a non-zero multiple of 4. One more group is then
+; *                  produced from the word already loaded, with its 5th
+; *                  output being a plain copy of the following source byte.
+; *                  In total source_width + 5 bytes are read and
+; *                  5 * (source_width / 4 + 1) bytes are written.
+; *                  NOTE(review): callers presumably account for that extra
+; *                  group; confirm against the call sites.
+; *                  NOTE(review): the source is read with word loads, which
+; *                  presumably requires a 4-byte aligned pointer - confirm.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+;   r0 = UINT8 *source
+;   r1 = UINT32 source_width
+;   r2 = UINT8 *dest
+;   r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    mov     mask, #255              ; mask for selection
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+    ldr     r3, [src], #4           ; first 4 source pixels
+
+    ; Two pixels are packed as lo | hi << 16 and multiplied by a packed
+    ; weight pair wl | wh << 16; bits 31:16 of the product then hold
+    ; lo * wh + hi * wl. Each result has 0x8000 added and is shifted right
+    ; by 24 to bring it back to a byte.
+    ; NOTE(review): a round-to-nearest of the sum in bits 31:16 would add
+    ; 0x800000 rather than 0x8000 - confirm the intended rounding.
+hl45_loop
+
+    and     r4, r3, mask            ; a = src[0]
+    and     r5, mask, r3, lsr #8    ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    and     r7, mask, r3, lsr #16   ; c = src[2]
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24   ; d = src[3]
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    ldr     r3, [src], #4           ; next 4 source pixels
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x8000
+    and     r9, mask, r3            ; e = src[4]
+    orr     r9, r9, r8, lsl #16     ; d | e
+    mul     r9, c51_205, r9         ; d * 205 + 51 * e
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #4
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          ; out[4]
+
+    bne     hl45_loop
+
+    ; Last group, taken from the word loaded by the final loop pass.
+    ; The pixels must be extracted with right shifts, exactly as in the
+    ; loop: a left shift leaves the low byte zero, so the mask would
+    ; always yield 0 for b, c and d.
+    and     r4, r3, mask            ; a = src[0]
+    and     r5, mask, r3, lsr #8    ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    and     r7, mask, r3, lsr #16   ; c = src[2]
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24   ; d = src[3]
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    ldrb    r3, [src]               ; byte following the last loaded word
+    strb    r3, [dest], #1          ; out[4] is copied, not blended
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_4_5_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
+; *                  height of the band scaled is 4-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *                  It works in place, one column per loop pass: rows 0-3
+; *                  and row 5 of the column are read, rows 1-4 are
+; *                  rewritten and row 0 is left as it is.
+; *                  The loop counts dest_width down to 0, so dest_width
+; *                  must be non-zero.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+    ; r0 (alias src) is the top of the current column, r3 walks down the
+    ; column for reads and lr walks down it for writes. Two pixels packed
+    ; as lo | hi << 16, multiplied by a packed weight pair, leave the
+    ; weighted sum in bits 31:16; it is brought back to a byte by adding
+    ; 0x8000 and shifting right by 24.
+vl45_loop
+    mov     r3, src
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
+    add     lr, src, r1             ; first row written is row 1
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    ldrb    r8, [r3], r1            ; d = des[dest_pitch*3]
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1            ; row 1
+
+    ldrb    r9, [r3, r1]            ; e = des [dest_pitch * 5]
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    orr     r9, r9, r8, lsl #16     ; d | e
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            ; row 2
+
+    mul     r9, c51_205, r9         ; d * 205 + 51 * e
+    add     r7, r7, #0x8000
+    add     src, src, #1            ; next column
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            ; row 3
+
+    add     r9, r9, #0x8000
+    subs    r2, r2, #1              ; one column done
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            ; row 4
+
+    bne     vl45_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_2_3_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Pixel count driving the
+; *                                                 main loop (2 per pass).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : NOT USED.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 2 to 3.
+; *
+; *  SPECIAL NOTES : Each pair a b (plus c, the first pixel of the next
+; *                  pair) gives 3 output pixels:
+; *                      a, (85a + 171b + 128) >> 8, (171b + 85c + 128) >> 8
+; *                  The main loop runs source_width / 2 times and only ends
+; *                  when the count reaches exactly 0, so source_width must
+; *                  be a non-zero multiple of 2. One more pair is then
+; *                  produced after the loop, whose 3rd output is a plain
+; *                  copy of b. In total source_width + 2 bytes are read and
+; *                  3 * (source_width / 2 + 1) bytes are written.
+; *                  NOTE(review): callers presumably account for that extra
+; *                  pair; confirm against the call sites.
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ldr     lr,  =85                ; weight of the outer pixel
+    ldr     r12, =171               ; weight of the middle pixel
+
+hl23_loop
+
+    ldrb    r3, [src], #1           ; a
+    ldrb    r4, [src], #1           ; b
+    ldrb    r5, [src]               ; c (src is left pointing at it)
+
+    strb    r3, [dest], #1          ; out[0] = a
+    mul     r4, r12, r4             ; b * 171
+    mla     r6, lr, r3, r4          ; a * 85 + b * 171
+    mla     r7, lr, r5, r4          ; c * 85 + b * 171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest], #1          ; out[1]
+
+    add     r7, r7, #128
+    mov     r7, r7, lsr #8
+    strb    r7, [dest], #1          ; out[2]
+
+    subs    srcw, srcw, #2
+    bne     hl23_loop
+
+    ; Last pair: r5 still holds its first pixel (loaded as c above).
+    ldrb    r4, [src, #1]           ; b
+    strb    r5, [dest], #1          ; out[0] = a
+    strb    r4, [dest, #1]          ; out[2] = b, copied unblended
+
+    mul     r4, r12, r4             ; b * 171
+    mla     r6, lr, r5, r4          ; a * 85 + b *171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest]              ; out[1]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_2_3_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
+; *                  height of the band scaled is 2-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *                  It works in place, one column per loop pass: rows 0, 1
+; *                  and 3 of the column are read, rows 1 and 2 are
+; *                  rewritten and row 0 is left as it is.
+; *                  The loop counts dest_width down to 0, so dest_width
+; *                  must be non-zero.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+    stmdb   sp!, {r4 - r8, lr}
+    ldr     lr,  =85                ; weight of the outer pixel
+    ldr     r12, =171               ; weight of the middle pixel
+    add     r3, r1, r1, lsl #1      ; 3 * dest_pitch
+
+    ; r0 (alias src) is the top of the current column.
+vl23_loop
+    ldrb    r4, [src]               ; a = des [0]
+    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
+    ldrb    r7, [src, r3]           ; c = des [dest_pitch*3]
+    subs    r2, r2, #1              ; flags are consumed by the bne below
+
+    mul     r5, r12, r5             ; b * 171
+    mla     r6, lr, r4, r5          ; a * 85 + b * 171
+    mla     r8, lr, r7, r5          ; c * 85 + b * 171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [src, r1]           ; row 1
+
+    add     r8, r8, #128
+    mov     r8, r8, lsr #8
+    strb    r8, [src, r1, lsl #1]   ; row 2
+
+    add     src, src, #1            ; next column
+
+    bne     vl23_loop
+
+    ldmia   sp!, {r4 - r8, pc}
+    ENDP    ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_3_5_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Pixel count driving the
+; *                                                 main loop (3 per pass).
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : NOT USED.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 3 to 5.
+; *
+; *  SPECIAL NOTES : Each group of 3 source pixels a b c (plus d, the first
+; *                  pixel of the next group) gives 5 output pixels:
+; *                      a, 102a+154b, 205b+51c, 51b+205c, 154c+102d
+; *                  with the weighted sums scaled back down by 256.
+; *                  The main loop subtracts 3 from source_width per pass
+; *                  and repeats while the result is not negative, i.e. it
+; *                  runs source_width / 3 + 1 times. One more group is then
+; *                  produced after the loop, whose 5th output is a plain
+; *                  copy of c.
+; *                  NOTE(review): callers presumably account for the groups
+; *                  produced beyond source_width; confirm against the call
+; *                  sites.
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+    ldrb    r4, [src], #1           ; a = src[0]
+
+    ; Two pixels are packed as lo | hi << 16 and multiplied by a packed
+    ; weight pair wl | wh << 16; bits 31:16 of the product then hold
+    ; lo * wh + hi * wl. Each result has 0x8000 added and is shifted right
+    ; by 24 to bring it back to a byte.
+hl35_loop
+
+    ldrb    r8, [src], #1           ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r8, lsl #16     ; b | a
+    ldrb    r9, [src], #1           ; c = src[2]
+    mul     r6, c102_154, r6        ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     ; b | c
+    mul     r5, c51_205, r5         ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    ldrb    r4, [src], #1           ; d = src[3], the next group's a
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r9, lsl #16     ; c | b
+    mul     r7, c51_205, r7         ; b * 51 + 205 * c
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    orr     r9, r4, r9, lsl #16     ; c | d
+    mul     r9, c102_154, r9        ; c * 154 + 102 * d
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #3
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          ; out[4]
+
+    bpl     hl35_loop
+
+    ; Last group: r4 already holds its first pixel. b has to be loaded
+    ; into r8, the register the blends below read; loading it anywhere
+    ; else would blend with the b of the previous group.
+    ldrb    r8, [src], #1           ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r8, lsl #16     ; b | a
+    ldrb    r9, [src], #1           ; c = src[2]
+    mul     r6, c102_154, r6        ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     ; b | c
+    mul     r5, c51_205, r5         ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r9, lsl #16     ; c | b
+    mul     r7, c51_205, r7         ; b * 51 + 205 * c
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+    strb    r9, [dest], #1          ; out[4] = c, copied unblended
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_3_5_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
+; *                  height of the band scaled is 3-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *                  It works in place, one column per loop pass: rows 0, 1,
+; *                  2 and 5 of the column are read, rows 1-4 are rewritten
+; *                  and row 0 is left as it is.
+; *                  The loop counts dest_width down to 0, so dest_width
+; *                  must be non-zero.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+    ; r0 (alias src) is the top of the current column, r3 walks down the
+    ; column for reads (and is then reused as scratch) and lr walks down
+    ; it for writes. Two pixels packed as lo | hi << 16, multiplied by a
+    ; packed weight pair, leave the weighted sum in bits 31:16; it is
+    ; brought back to a byte by adding 0x8000 and shifting right by 24.
+vl35_loop
+    mov     r3, src
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
+    add     lr, src, r1             ; first row written is row 1
+
+    orr     r8, r4, r5, lsl #16     ; b | a
+    mul     r6, c102_154, r8        ; a * 102 + 154 * b
+
+    ldrb    r8, [r3, r1, lsl #1]    ; d = des[dest_pitch*5]
+    orr     r3, r7, r5, lsl #16     ; b | c
+    mul     r9, c51_205, r3         ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    orr     r3, r5, r7, lsl #16     ; c | b
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1            ; row 1
+
+    mul     r5, c51_205, r3         ; b * 51 + 205 * c
+    add     r9, r9, #0x8000
+    orr     r3, r8, r7, lsl #16     ; c | d
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            ; row 2
+
+    mul     r7, c102_154, r3        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    add     src, src, #1            ; next column
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            ; row 3
+
+    add     r7, r7, #0x8000
+    subs    r2, r2, #1              ; one column done
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            ; row 4
+
+
+    bne     vl35_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_3_4_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Stride of source.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 3 to 4.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC      ; in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is not read
+    stmdb   sp!, {r4 - r11, lr}             ; preserve r4-r11 and the return address
+
+    ldr     r10, =64                        ; weight 64
+    ldr     r11, =192                       ; weight 192
+    mov     r9, #128                        ; rounding term added before each >> 8
+
+    ldrb    r4, [src], #1           ; a = src[0]
+
+hl34_loop                                   ; each pass turns 3 source pixels into 4 output pixels
+
+    ldrb    r8, [src], #1           ; b = src[1]
+    ldrb    r7, [src], #1           ; c = src[2]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r8, r4         ; a*64 + b*192 + 128
+
+    add     r8, r8, #1              ; b + 1
+    add     r8, r8, r7              ; b + c + 1
+    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          ; out[1]
+
+    strb    r8, [dest], #1          ; out[2]
+
+    ldrb    r4, [src], #1           ; a = first pixel of the next triple
+
+    mla     r7, r11, r7, r9         ; c*192 + 128
+    mla     r7, r4, r10, r7         ; c*192 + a*64 + 128 (a is the next triple's first pixel)
+
+    subs    srcw, srcw, #3          ; sets the flags consumed by the bpl below
+
+    mov     r7, r7, asr #8          ; (c*192 + a*64 + 128) >> 8
+    strb    r7, [dest], #1          ; out[3]
+
+    bpl     hl34_loop               ; NOTE(review): also loops when srcw hits 0, so 3k pixels give k+1 passes plus the tail - confirm
+
+    ldrb    r8, [src], #1           ; b = src[1]
+    ldrb    r7, [src], #1           ; c = src[2]
+    strb    r4, [dest], #1          ; tail: out[0] = a (loaded by the last loop pass)
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r8, r4         ; a*64 + b*192 + 128
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          ; out[1]
+
+    add     r8, r8, #1              ; b + 1
+    add     r8, r8, r7              ; b + c + 1
+    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
+    strb    r8, [dest], #1          ; out[2]
+    strb    r7, [dest], #1          ; out[3] = c, copied because the tail reads no following pixel
+
+    ldmia   sp!, {r4 - r11, pc}     ; restore registers and return
+    ENDP    ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_3_4_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
+; *                  height of the band scaled is 3-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC        ; in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r11, lr}             ; preserve r4-r11 and the return address
+
+    ldr     r10, =64                        ; weight 64
+    ldr     r11, =192                       ; weight 192
+    mov     r9, #128                        ; rounding term added before each >> 8
+
+;   ldr     r1,[r1]
+vl34_loop                                   ; one pass per column, r2 = columns remaining
+    mov     r3, src                         ; r3 walks down the rows of the current column
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des [dest_pitch*2]; leaves r3 = des + dest_pitch*3
+    add     lr, src, r1             ; lr = output pointer, starting at row 1
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r5, r4         ; a*64 + b*192 + 128
+
+    add     r5, r5, #1              ; b + 1
+    add     r5, r5, r7              ; b + c + 1
+    mov     r5, r5, asr #1          ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [lr], r1            ; des [dest_pitch] = result; lr moves to row 2
+
+    ldrb    r4, [r3, r1]            ; a = des [dest_pitch*4]
+
+    strb    r5, [lr], r1            ; des [dest_pitch*2] = result; lr moves to row 3
+
+    mla     r7, r11, r7, r9         ; c*192 + 128
+    mla     r7, r4, r10, r7         ; c*192 + a*64 + 128 (a is the row at dest_pitch*4)
+    mov     r7, r7, asr #8          ; (c*192 + a*64 + 128) >> 8
+
+    add     src, src, #1            ; advance to the next column
+    subs    r2, r2, #1              ; sets the flags consumed by the bne below (strb leaves them alone)
+
+    strb    r7, [lr]                ; des [dest_pitch*3] = result
+
+    bne     vl34_loop               ; NOTE(review): a dest_width of 0 would wrap the counter - presumably never passed
+
+    ldmia   sp!, {r4 - r11, pc}     ; restore registers and return
+    ENDP    ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_1_2_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Stride of source.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 1 to 2.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC      ; in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is overwritten
+    stmdb   sp!, {r4 - r5, lr}              ; preserve r4-r5 and the return address
+
+    sub     srcw, srcw, #1                  ; the loop handles source_width - 1 pixels, the tail the last one
+
+    ldrb    r3, [src], #1                   ; r3 = current pixel
+    ldrb    r4, [src], #1                   ; r4 = next pixel
+hl12_loop                                   ; each pass emits the current pixel and its average with the next
+    subs    srcw, srcw, #1                  ; sets the flags used by ldrneb/bne below (nothing in between alters them)
+
+    add     r5, r3, r4
+    add     r5, r5, #1
+    mov     r5, r5, lsr #1                  ; (current + next + 1) >> 1
+
+    orr     r5, r3, r5, lsl #8              ; low byte = current pixel, high byte = average
+    strh    r5, [dest], #2                  ; assumes little-endian order and halfword-aligned dest - TODO confirm
+
+    mov     r3, r4                          ; next pixel becomes the current one
+
+    ldrneb  r4, [src], #1                   ; fetch a new "next" pixel only if another pass follows
+    bne     hl12_loop                       ; NOTE(review): a source_width below 2 never reaches zero here - confirm callers
+
+    orr     r5, r4, r4, lsl #8              ; last pixel is duplicated: no neighbour to average with
+    strh    r5, [dest]
+
+    ldmia   sp!, {r4 - r5, pc}              ; restore registers and return
+    ENDP    ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_1_2_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+; *                  height of the band scaled is 1-pixel.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC        ; in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r7, lr}              ; preserve r4-r7 and the return address
+
+    ldr     mask, =0xff00ff             ; mask for selection: keeps bytes 0 and 2 of a word
+    ldr     lr, = 0x010001              ; +1 rounding term for each 16-bit lane
+
+vl12_loop                                   ; each pass averages 4 columns of row 0 and row 2 into row 1
+    mov     r3, src
+    ldr     r4, [r3], r1                ; r4 = 4 pixels of row 0; r3 = row 1 (assumes word-aligned dest - TODO confirm)
+    ldr     r5, [r3, r1]                ; r5 = 4 pixels of row 2
+
+    add     src, src, #4                ; advance 4 columns
+    subs    r2, r2, #4                  ; sets the flags used by the bgt below (no later instruction alters them)
+
+    and     r6, r4, mask                ; even bytes of row 0, one per 16-bit lane
+    and     r7, r5, mask                ; even bytes of row 2
+
+    add     r6, r7, r6
+    add     r6, r6, lr                  ; lane sums + 1 (at most 511, so lanes cannot overflow)
+
+    and     r4, mask, r4, lsr #8        ; odd bytes of row 0
+    and     r5, mask, r5, lsr #8        ; odd bytes of row 2
+
+    mov     r6, r6, lsr #1
+    and     r6, r6, mask                ; (x + y + 1) >> 1 per lane; the mask drops bits shifted in from the upper lane
+
+    add     r4, r5, r4
+    add     r4, r4, lr
+
+    mov     r4, r4, lsr #1
+    and     r4, r4, mask                ; averaged odd bytes
+
+    orr     r5, r6, r4, lsl #8          ; re-interleave even and odd results
+
+    str     r5, [r3]                    ; write 4 averaged pixels to row 1
+
+    bgt     vl12_loop                   ; loop only while columns remain; bpl ran one extra word when the width was a multiple of 4
+
+    ldmia   sp!, {r4 - r7, pc}          ; restore registers and return
+    ENDP    ;|vertical_band_1_2_scale_armv4|
+
+    END
diff --git a/vpx_scale/symbian/gen_scalers_armv4.s b/vpx_scale/symbian/gen_scalers_armv4.s
new file mode 100644
index 000000000..3dfd0b9b9
--- /dev/null
+++ b/vpx_scale/symbian/gen_scalers_armv4.s
@@ -0,0 +1,808 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+
+    .equ WIDE_REFERENCE, 0
+    .ifndef ARCHITECTURE
+    .equ ARCHITECTURE, 5
+    .endif
+    .global horizontal_line_4_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_4_5_scale_armv4, function
+    .endif
+    .global vertical_band_4_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_4_5_scale_armv4, function
+    .endif
+    .global horizontal_line_2_3_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_2_3_scale_armv4, function
+    .endif
+    .global vertical_band_2_3_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_2_3_scale_armv4, function
+    .endif
+    .global horizontal_line_3_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_3_5_scale_armv4, function
+    .endif
+    .global vertical_band_3_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_3_5_scale_armv4, function
+    .endif
+    .global horizontal_line_3_4_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_3_4_scale_armv4, function
+    .endif
+    .global vertical_band_3_4_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_3_4_scale_armv4, function
+    .endif
+    .global horizontal_line_1_2_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_1_2_scale_armv4, function
+    .endif
+    .global vertical_band_1_2_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_1_2_scale_armv4, function
+    .endif
+
+.text
+
+src         .req    r0
+srcw        .req    r1
+dest        .req    r2
+mask        .req    r12
+c51_205     .req    r10
+c102_154    .req    r11
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : horizontal_line_4_5_scale_armv4
+@ *
+@ *  INPUTS        : const unsigned char *source : Pointer to source data.
+@ *                  unsigned int source_width    : Stride of source.
+@ *                  unsigned char *dest         : Pointer to destination data.
+@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Copies horizontal line of pixels from source to
+@ *                  destination scaling up by 4 to 5.
+@ *
+@ *  SPECIAL NOTES : None.
+@ *
+@ ****************************************************************************/
+@void horizontal_line_4_5_scale_armv4
+@(
+@   r0 = UINT8 *source
+@   r1 = UINT32 source_width
+@   r2 = UINT8 *dest
+@   r3 = UINT32 dest_width
+@)
+_HorizontalLine_4_5_Scale_ARMv4:
+    horizontal_line_4_5_scale_armv4: @ in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is overwritten as scratch
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+
+    mov     mask, #255              @ mask for selection
+    ldr     c51_205, =0x3300cd      @ packed weights: 51 (high halfword) and 205 (low halfword)
+    ldr     c102_154, =0x66009a     @ packed weights: 102 (high halfword) and 154 (low halfword)
+
+    ldr     r3, [src], #4           @ r3 = 4 source pixels (assumes word-aligned, little-endian source - TODO confirm)
+
+hl45_loop:
+
+    and     r4, r3, mask            @ a = src[0]
+    and     r5, mask, r3, lsr #8    @ b = src[1]
+    strb    r4, [dest], #1          @ out[0] = a
+
+    orr     r6, r4, r5, lsl #16     @ r6 = a | (b << 16)
+    and     r7, mask, r3, lsr #16   @ c = src[2]
+    mul     r6, c51_205, r6         @ bits 31:16 = a * 51 + 205 * b
+
+    orr     r5, r5, r7, lsl #16     @ r5 = b | (c << 16)
+    mul     r5, c102_154, r5        @ bits 31:16 = b * 102 + 154 * c
+    add     r6, r6, #0x8000         @ NOTE(review): bias is at bit 15 but the shift below is 24 - confirm rounding intent
+    and     r8, mask, r3, lsr #24   @ d = src[3]
+    mov     r6, r6, lsr #24         @ keep the top byte of the weighted sum
+    strb    r6, [dest], #1          @ out[1]
+
+    orr     r7, r8, r7, lsl #16     @ r7 = d | (c << 16)
+    mul     r7, c102_154, r7        @ bits 31:16 = c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    ldr     r3, [src], #4           @ fetch the next 4 source pixels
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          @ out[2]
+
+    add     r7, r7, #0x8000
+    and     r9, mask, r3            @ e = src[4]
+    orr     r9, r9, r8, lsl #16     @ r9 = e | (d << 16)
+    mul     r9, c51_205, r9         @ bits 31:16 = d * 205 + 51 * e
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          @ out[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #4          @ sets the flags consumed by the bne below
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          @ out[4]
+
+    bne     hl45_loop               @ NOTE(review): only ends when srcw reaches exactly 0, so it must be a non-zero multiple of 4
+
+    and     r4, r3, mask            @ tail: a = byte 0 of the word fetched by the last pass
+    and     r5, mask, r3, lsr #8    @ b = byte 1 (lsr as in the loop; lsl under mask 0xff always gave 0)
+    strb    r4, [dest], #1          @ out[0] = a
+
+    orr     r6, r4, r5, lsl #16     @ r6 = a | (b << 16)
+    mul     r6, c51_205, r6         @ bits 31:16 = a * 51 + 205 * b
+
+    and     r7, mask, r3, lsr #16   @ c = byte 2
+    orr     r5, r5, r7, lsl #16     @ r5 = b | (c << 16)
+    mul     r5, c102_154, r5        @ bits 31:16 = b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24   @ d = byte 3
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          @ out[1]
+
+    orr     r7, r8, r7, lsl #16     @ r7 = d | (c << 16)
+    mul     r7, c102_154, r7        @ bits 31:16 = c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          @ out[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          @ out[3]
+
+    ldrb    r3, [src]               @ NOTE(review): src is already past the tail word, so this is the byte after d - confirm
+    strb    r3, [dest], #1          @ out[4] = that byte, copied unfiltered
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|horizontal_line_4_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : vertical_band_4_5_scale_armv4
+@ *
+@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+@ *                  unsigned int dest_pitch : Stride of destination data.
+@ *                  unsigned int dest_width : Width of destination data.
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
+@ *                  height of the band scaled is 4-pixels.
+@ *
+@ *  SPECIAL NOTES : The routine uses the first line of the band below
+@ *                  the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_4_5_scale_armv4
+@(
+@   r0 = UINT8 *dest
+@   r1 = UINT32 dest_pitch
+@   r2 = UINT32 dest_width
+@)
+_VerticalBand_4_5_Scale_ARMv4:
+    vertical_band_4_5_scale_armv4: @ in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+
+    ldr     c51_205, =0x3300cd      @ packed weights: 51 (high halfword) and 205 (low halfword)
+    ldr     c102_154, =0x66009a     @ packed weights: 102 (high halfword) and 154 (low halfword)
+
+vl45_loop:
+    mov     r3, src                 @ r3 walks down the rows of the current column
+    ldrb    r4, [r3], r1            @ a = des [0]
+    ldrb    r5, [r3], r1            @ b = des [dest_pitch]
+    ldrb    r7, [r3], r1            @ c = des[dest_pitch*2]
+    add     lr, src, r1             @ lr = output pointer, starting at row 1
+
+    orr     r6, r4, r5, lsl #16     @ r6 = a | (b << 16)
+    mul     r6, c51_205, r6         @ bits 31:16 = a * 51 + 205 * b
+
+    ldrb    r8, [r3], r1            @ d = des[dest_pitch*3]; leaves r3 = des + dest_pitch*4
+    orr     r5, r5, r7, lsl #16     @ r5 = b | (c << 16)
+    mul     r5, c102_154, r5        @ bits 31:16 = b * 102 + 154 * c
+    add     r6, r6, #0x8000         @ NOTE(review): bias is at bit 15 but the shift below is 24 - confirm rounding intent
+    orr     r7, r8, r7, lsl #16     @ r7 = d | (c << 16)
+    mov     r6, r6, lsr #24         @ keep the top byte of the weighted sum
+    strb    r6, [lr], r1            @ des [dest_pitch] = result; lr moves to row 2
+
+    ldrb    r9, [r3, r1]            @ e = des [dest_pitch * 5]
+    mul     r7, c102_154, r7        @ bits 31:16 = c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    orr     r9, r9, r8, lsl #16     @ r9 = e | (d << 16)
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            @ des [dest_pitch*2] = result; lr moves to row 3
+
+    mul     r9, c51_205, r9         @ bits 31:16 = d * 205 + 51 * e
+    add     r7, r7, #0x8000
+    add     src, src, #1            @ advance to the next column
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            @ des [dest_pitch*3] = result; lr moves to row 4
+
+    add     r9, r9, #0x8000
+    subs    r2, r2, #1              @ sets the flags consumed by the bne below (mov/strb leave them alone)
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            @ des [dest_pitch*4] = result
+
+    bne     vl45_loop               @ NOTE(review): a dest_width of 0 would wrap the counter - presumably never passed
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|vertical_band_4_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : horizontal_line_2_3_scale_armv4
+@ *
+@ *  INPUTS        : const unsigned char *source : Pointer to source data.
+@ *                  unsigned int source_width    : Stride of source.
+@ *                  unsigned char *dest         : Pointer to destination data.
+@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Copies horizontal line of pixels from source to
+@ *                  destination scaling up by 2 to 3.
+@ *
+@ *  SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_2_3_scale_armv4
+@(
+@   const unsigned char *source,
+@   unsigned int source_width,
+@   unsigned char *dest,
+@   unsigned int dest_width
+@)
+_HorizontalLine_2_3_Scale_ARMv4:
+    horizontal_line_2_3_scale_armv4: @ in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is overwritten as scratch
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+    ldr     lr,  =85                @ weight 85
+    ldr     r12, =171               @ weight 171
+
+hl23_loop:
+
+    ldrb    r3, [src], #1           @ a
+    ldrb    r4, [src], #1           @ b
+    ldrb    r5, [src]               @ c = first pixel of the next pair (read without advancing src)
+
+    strb    r3, [dest], #1          @ out[0] = a
+    mul     r4, r12, r4             @ b * 171
+    mla     r6, lr, r3, r4          @ a * 85 + b * 171
+    mla     r7, lr, r5, r4          @ c * 85 + b * 171
+
+    add     r6, r6, #128            @ round to nearest before the >> 8
+    mov     r6, r6, lsr #8
+    strb    r6, [dest], #1          @ out[1]
+
+    add     r7, r7, #128
+    mov     r7, r7, lsr #8
+    strb    r7, [dest], #1          @ out[2]
+
+    subs    srcw, srcw, #2
+    bne     hl23_loop               @ NOTE(review): only ends when srcw reaches exactly 0, so it must be a non-zero multiple of 2
+
+    ldrb    r4, [src, #1]           @ b = pixel after the one held in r5
+    strb    r5, [dest], #1          @ tail: out[0] = a (r5, peeked by the last loop pass)
+    strb    r4, [dest, #1]          @ out[2] = b, copied because the tail reads no following pixel
+
+    mul     r4, r12, r4             @ b * 171
+    mla     r6, lr, r5, r4          @ a * 85 + b *171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest]              @ out[1]
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|horizontal_line_2_3_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : vertical_band_2_3_scale_armv4
+@ *
+@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+@ *                  unsigned int dest_pitch : Stride of destination data.
+@ *                  unsigned int dest_width : Width of destination data.
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
+@ *                  height of the band scaled is 2-pixels.
+@ *
+@ *  SPECIAL NOTES : The routine uses the first line of the band below
+@ *                  the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_2_3_scale_armv4
+@(
+@   r0 = UINT8 *dest
+@   r1 = UINT32 dest_pitch
+@   r2 = UINT32 dest_width
+@)
+_VerticalBand_2_3_Scale_ARMv4:
+    vertical_band_2_3_scale_armv4: @ in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r8, lr}      @ preserve r4-r8 and the return address
+    ldr     lr,  =85                @ weight 85
+    ldr     r12, =171               @ weight 171
+    add     r3, r1, r1, lsl #1      @ 3 * dest_pitch
+
+vl23_loop:
+    ldrb    r4, [src]               @ a = des [0]
+    ldrb    r5, [src, r1]           @ b = des [dest_pitch]
+    ldrb    r7, [src, r3]           @ c = des [dest_pitch*3]
+    subs    r2, r2, #1              @ sets the flags consumed by the bne below (nothing in between alters them)
+
+    mul     r5, r12, r5             @ b * 171
+    mla     r6, lr, r4, r5          @ a * 85 + b * 171
+    mla     r8, lr, r7, r5          @ c * 85 + b * 171
+
+    add     r6, r6, #128            @ round to nearest before the >> 8
+    mov     r6, r6, lsr #8
+    strb    r6, [src, r1]           @ des [dest_pitch] = result
+
+    add     r8, r8, #128
+    mov     r8, r8, lsr #8
+    strb    r8, [src, r1, lsl #1]   @ des [dest_pitch*2] = result
+
+    add     src, src, #1            @ advance to the next column
+
+    bne     vl23_loop               @ NOTE(review): a dest_width of 0 would wrap the counter - presumably never passed
+
+    ldmia   sp!, {r4 - r8, pc}      @ restore registers and return
+    @   @|vertical_band_2_3_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : horizontal_line_3_5_scale_armv4
+@ *
+@ *  INPUTS        : const unsigned char *source : Pointer to source data.
+@ *                  unsigned int source_width    : Stride of source.
+@ *                  unsigned char *dest         : Pointer to destination data.
+@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Copies horizontal line of pixels from source to
+@ *                  destination scaling up by 3 to 5.
+@ *
+@ *  SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_3_5_scale_armv4
+@(
+@   const unsigned char *source,
+@   unsigned int source_width,
+@   unsigned char *dest,
+@   unsigned int dest_width
+@)
+_HorizontalLine_3_5_Scale_ARMv4:
+    horizontal_line_3_5_scale_armv4: @ in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is not read
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+
+    ldr     c51_205, =0x3300cd      @ packed weights: 51 (high halfword) and 205 (low halfword)
+    ldr     c102_154, =0x66009a     @ packed weights: 102 (high halfword) and 154 (low halfword)
+
+    ldrb    r4, [src], #1           @ a = src[0]
+
+hl35_loop:
+
+    ldrb    r8, [src], #1           @ b = src[1]
+    strb    r4, [dest], #1          @ out[0] = a
+
+    orr     r6, r4, r8, lsl #16     @ r6 = a | (b << 16)
+    ldrb    r9, [src], #1           @ c = src[2]
+    mul     r6, c102_154, r6        @ bits 31:16 = a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     @ r5 = c | (b << 16)
+    mul     r5, c51_205, r5         @ bits 31:16 = b * 205 + 51 * c
+    add     r6, r6, #0x8000         @ NOTE(review): bias is at bit 15 but the shift below is 24 - confirm rounding intent
+    ldrb    r4, [src], #1           @ d = src[3], which is the next triple's a
+    mov     r6, r6, lsr #24         @ keep the top byte of the weighted sum
+    strb    r6, [dest], #1          @ out[1]
+
+    orr     r7, r8, r9, lsl #16     @ r7 = b | (c << 16)
+    mul     r7, c51_205, r7         @ bits 31:16 = c * 205 + 51 * b
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          @ out[2]
+
+    orr     r9, r4, r9, lsl #16     @ r9 = d | (c << 16)
+    mul     r9, c102_154, r9        @ bits 31:16 = c * 154 + 102 * d
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          @ out[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #3          @ sets the flags consumed by the bpl below
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          @ out[4]
+
+    bpl     hl35_loop               @ NOTE(review): also loops when srcw hits 0, so 3k pixels give k+1 passes plus the tail - confirm
+
+    ldrb    r8, [src], #1           @ b = src[1]; must land in r8, which the math below reads (r5 was dead)
+    strb    r4, [dest], #1          @ tail: out[0] = a (loaded by the last loop pass)
+
+    orr     r6, r4, r8, lsl #16     @ r6 = a | (b << 16)
+    ldrb    r9, [src], #1           @ c = src[2]
+    mul     r6, c102_154, r6        @ bits 31:16 = a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     @ r5 = c | (b << 16)
+    mul     r5, c51_205, r5         @ bits 31:16 = b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          @ out[1]
+
+    orr     r7, r8, r9, lsl #16     @ r7 = b | (c << 16)
+    mul     r7, c51_205, r7         @ bits 31:16 = c * 205 + 51 * b
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          @ out[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          @ out[3]
+    strb    r9, [dest], #1          @ out[4] = c, copied because the tail reads no following pixel
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|horizontal_line_3_5_scale_armv4|
+
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : vertical_band_3_5_scale_armv4
+@ *
+@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+@ *                  unsigned int dest_pitch : Stride of destination data.
+@ *                  unsigned int dest_width : Width of destination data.
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
+@ *                  height of the band scaled is 3-pixels.
+@ *
+@ *  SPECIAL NOTES : The routine uses the first line of the band below
+@ *                  the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_3_5_scale_armv4
+@(
+@   r0 = UINT8 *dest
+@   r1 = UINT32 dest_pitch
+@   r2 = UINT32 dest_width
+@)
+_VerticalBand_3_5_Scale_ARMv4:
+    vertical_band_3_5_scale_armv4: @ in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+
+    ldr     c51_205, =0x3300cd      @ packed weights: 51 (high halfword) and 205 (low halfword)
+    ldr     c102_154, =0x66009a     @ packed weights: 102 (high halfword) and 154 (low halfword)
+
+vl35_loop:
+    mov     r3, src                 @ r3 walks down the rows of the current column
+    ldrb    r4, [r3], r1            @ a = des [0]
+    ldrb    r5, [r3], r1            @ b = des [dest_pitch]
+    ldrb    r7, [r3], r1            @ c = des[dest_pitch*2]; leaves r3 = des + dest_pitch*3
+    add     lr, src, r1             @ lr = output pointer, starting at row 1
+
+    orr     r8, r4, r5, lsl #16     @ r8 = a | (b << 16)
+    mul     r6, c102_154, r8        @ bits 31:16 = a * 102 + 154 * b
+
+    ldrb    r8, [r3, r1, lsl #1]    @ d = des[dest_pitch*5] (r3 + 2 * dest_pitch)
+    orr     r3, r7, r5, lsl #16     @ r3 = c | (b << 16); r3 is plain scratch from here on
+    mul     r9, c51_205, r3         @ bits 31:16 = b * 205 + 51 * c
+    add     r6, r6, #0x8000         @ NOTE(review): bias is at bit 15 but the shift below is 24 - confirm rounding intent
+    orr     r3, r5, r7, lsl #16     @ r3 = b | (c << 16)
+    mov     r6, r6, lsr #24         @ keep the top byte of the weighted sum
+    strb    r6, [lr], r1            @ des [dest_pitch] = result; lr moves to row 2
+
+    mul     r5, c51_205, r3         @ bits 31:16 = c * 205 + 51 * b
+    add     r9, r9, #0x8000
+    orr     r3, r8, r7, lsl #16     @ r3 = d | (c << 16)
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            @ des [dest_pitch*2] = result; lr moves to row 3
+
+    mul     r7, c102_154, r3        @ bits 31:16 = c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    add     src, src, #1            @ advance to the next column
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            @ des [dest_pitch*3] = result; lr moves to row 4
+
+    add     r7, r7, #0x8000
+    subs    r2, r2, #1              @ sets the flags consumed by the bne below (mov/strb leave them alone)
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            @ des [dest_pitch*4] = result
+
+
+    bne     vl35_loop               @ NOTE(review): a dest_width of 0 would wrap the counter - presumably never passed
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|vertical_band_3_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : horizontal_line_3_4_scale_armv4
+@ *
+@ *  INPUTS        : const unsigned char *source : Pointer to source data.
+@ *                  unsigned int source_width    : Stride of source.
+@ *                  unsigned char *dest         : Pointer to destination data.
+@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Copies horizontal line of pixels from source to
+@ *                  destination scaling up by 3 to 4.
+@ *
+@ *  SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_3_4_scale_armv4
+@(
+@   const unsigned char *source,
+@   unsigned int source_width,
+@   unsigned char *dest,
+@   unsigned int dest_width
+@)
+_HorizontalLine_3_4_Scale_ARMv4:
+    horizontal_line_3_4_scale_armv4: @ in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is not read
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+
+    ldr     r10, =64                @ weight 64
+    ldr     r11, =192               @ weight 192
+    mov     r9, #128                @ rounding term added before each >> 8
+
+    ldrb    r4, [src], #1           @ a = src[0]
+
+hl34_loop:
+
+    ldrb    r8, [src], #1           @ b = src[1]
+    ldrb    r7, [src], #1           @ c = src[2]
+    strb    r4, [dest], #1          @ out[0] = a
+
+    mla     r4, r10, r4, r9         @ a*64 + 128
+    mla     r4, r11, r8, r4         @ a*64 + b*192 + 128
+
+    add     r8, r8, #1              @ b + 1
+    add     r8, r8, r7              @ b + c + 1
+    mov     r8, r8, asr #1          @ (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          @ (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          @ out[1]
+
+    strb    r8, [dest], #1          @ out[2]
+
+    ldrb    r4, [src], #1           @ a = first pixel of the next triple
+
+    mla     r7, r11, r7, r9         @ c*192 + 128
+    mla     r7, r4, r10, r7         @ c*192 + a*64 + 128 (a is the next triple's first pixel)
+
+    subs    srcw, srcw, #3          @ sets the flags consumed by the bpl below
+
+    mov     r7, r7, asr #8          @ (c*192 + a*64 + 128) >> 8
+    strb    r7, [dest], #1          @ out[3]
+
+    bpl     hl34_loop               @ NOTE(review): also loops when srcw hits 0, so 3k pixels give k+1 passes plus the tail - confirm
+
+    ldrb    r8, [src], #1           @ b = src[1]
+    ldrb    r7, [src], #1           @ c = src[2]
+    strb    r4, [dest], #1          @ tail: out[0] = a (loaded by the last loop pass)
+
+    mla     r4, r10, r4, r9         @ a*64 + 128
+    mla     r4, r11, r8, r4         @ a*64 + b*192 + 128
+    mov     r4, r4, asr #8          @ (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          @ out[1]
+
+    add     r8, r8, #1              @ b + 1
+    add     r8, r8, r7              @ b + c + 1
+    mov     r8, r8, asr #1          @ (b + c + 1) >> 1
+    strb    r8, [dest], #1          @ out[2]
+    strb    r7, [dest], #1          @ out[3] = c, copied because the tail reads no following pixel
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|horizontal_line_3_4_scale_armv4|
+
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : vertical_band_3_4_scale_armv4
+@ *
+@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+@ *                  unsigned int dest_pitch : Stride of destination data.
+@ *                  unsigned int dest_width : Width of destination data.
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
+@ *                  height of the band scaled is 3-pixels.
+@ *
+@ *  SPECIAL NOTES : The routine uses the first line of the band below
+@ *                  the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_3_4_scale_armv4
+@(
+@   r0 = UINT8 *dest
+@   r1 = UINT32 dest_pitch
+@   r2 = UINT32 dest_width
+@)
+_VerticalBand_3_4_Scale_ARMv4:
+    vertical_band_3_4_scale_armv4: @ in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r11, lr}     @ preserve r4-r11 and the return address
+
+    ldr     r10, =64                @ weight 64
+    ldr     r11, =192               @ weight 192
+    mov     r9, #128                @ rounding term added before each >> 8
+
+@   ldr     r1,[r1]
+vl34_loop:
+    mov     r3, src                 @ r3 walks down the rows of the current column
+    ldrb    r4, [r3], r1            @ a = des [0]
+    ldrb    r5, [r3], r1            @ b = des [dest_pitch]
+    ldrb    r7, [r3], r1            @ c = des [dest_pitch*2]; leaves r3 = des + dest_pitch*3
+    add     lr, src, r1             @ lr = output pointer, starting at row 1
+
+    mla     r4, r10, r4, r9         @ a*64 + 128
+    mla     r4, r11, r5, r4         @ a*64 + b*192 + 128
+
+    add     r5, r5, #1              @ b + 1
+    add     r5, r5, r7              @ b + c + 1
+    mov     r5, r5, asr #1          @ (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          @ (a*64 + b*192 + 128) >> 8
+    strb    r4, [lr], r1            @ des [dest_pitch] = result; lr moves to row 2
+
+    ldrb    r4, [r3, r1]            @ a = des [dest_pitch*4]
+
+    strb    r5, [lr], r1            @ des [dest_pitch*2] = result; lr moves to row 3
+
+    mla     r7, r11, r7, r9         @ c*192 + 128
+    mla     r7, r4, r10, r7         @ c*192 + a*64 + 128 (a is the row at dest_pitch*4)
+    mov     r7, r7, asr #8          @ (c*192 + a*64 + 128) >> 8
+
+    add     src, src, #1            @ advance to the next column
+    subs    r2, r2, #1              @ sets the flags consumed by the bne below (strb leaves them alone)
+
+    strb    r7, [lr]                @ des [dest_pitch*3] = result
+
+    bne     vl34_loop               @ NOTE(review): a dest_width of 0 would wrap the counter - presumably never passed
+
+    ldmia   sp!, {r4 - r11, pc}     @ restore registers and return
+    @   @|vertical_band_3_4_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : horizontal_line_1_2_scale_armv4
+@ *
+@ *  INPUTS        : const unsigned char *source : Pointer to source data.
+@ *                  unsigned int source_width    : Stride of source.
+@ *                  unsigned char *dest         : Pointer to destination data.
+@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Copies horizontal line of pixels from source to
+@ *                  destination scaling up by 1 to 2.
+@ *
+@ *  SPECIAL NOTES : None.
+@ *
+@ ****************************************************************************/
+@void horizontal_line_1_2_scale_armv4
+@(
+@   const unsigned char *source,
+@   unsigned int source_width,
+@   unsigned char *dest,
+@   unsigned int dest_width
+@)
+_HorizontalLine_1_2_Scale_ARMv4:
+    horizontal_line_1_2_scale_armv4: @ in: r0 = source, r1 = source_width, r2 = dest; r3 (dest_width) is overwritten
+    stmdb   sp!, {r4 - r5, lr}      @ preserve r4-r5 and the return address
+
+    sub     srcw, srcw, #1          @ the loop handles source_width - 1 pixels, the tail the last one
+
+    ldrb    r3, [src], #1           @ r3 = current pixel
+    ldrb    r4, [src], #1           @ r4 = next pixel
+hl12_loop:
+    subs    srcw, srcw, #1          @ sets the flags used by ldrneb/bne below (nothing in between alters them)
+
+    add     r5, r3, r4
+    add     r5, r5, #1
+    mov     r5, r5, lsr #1          @ (current + next + 1) >> 1
+
+    orr     r5, r3, r5, lsl #8      @ low byte = current pixel, high byte = average
+    strh    r5, [dest], #2          @ assumes little-endian order and halfword-aligned dest - TODO confirm
+
+    mov     r3, r4                  @ next pixel becomes the current one
+
+    ldrneb  r4, [src], #1           @ fetch a new "next" pixel only if another pass follows
+    bne     hl12_loop               @ NOTE(review): a source_width below 2 never reaches zero here - confirm callers
+
+    orr     r5, r4, r4, lsl #8      @ last pixel is duplicated: no neighbour to average with
+    strh    r5, [dest]
+
+    ldmia   sp!, {r4 - r5, pc}      @ restore registers and return
+    @   @|horizontal_line_1_2_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ *  ROUTINE       : vertical_band_1_2_scale_armv4
+@ *
+@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+@ *                  unsigned int dest_pitch : Stride of destination data.
+@ *                  unsigned int dest_width : Width of destination data.
+@ *
+@ *  OUTPUTS       : None.
+@ *
+@ *  RETURNS       : void
+@ *
+@ *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+@ *                  height of the band scaled is 1-pixel.
+@ *
+@ *  SPECIAL NOTES : The routine uses the first line of the band below
+@ *                  the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_1_2_scale_armv4
+@(
+@   r0 = UINT8 *dest
+@   r1 = UINT32 dest_pitch
+@   r2 = UINT32 dest_width
+@)
+_VerticalBand_1_2_Scale_ARMv4:
+    vertical_band_1_2_scale_armv4: @ in: r0 = dest (aliased "src"), r1 = dest_pitch, r2 = dest_width
+    stmdb   sp!, {r4 - r7, lr}          @ preserve r4-r7 and the return address
+
+    ldr     mask, =0xff00ff             @ mask for selection: keeps bytes 0 and 2 of a word
+    ldr     lr, = 0x010001              @ +1 rounding term for each 16-bit lane
+
+vl12_loop:
+    mov     r3, src
+    ldr     r4, [r3], r1                @ r4 = 4 pixels of row 0; r3 = row 1 (assumes word-aligned dest - TODO confirm)
+    ldr     r5, [r3, r1]                @ r5 = 4 pixels of row 2
+
+    add     src, src, #4                @ advance 4 columns
+    subs    r2, r2, #4                  @ sets the flags used by the bgt below (no later instruction alters them)
+
+    and     r6, r4, mask                @ even bytes of row 0, one per 16-bit lane
+    and     r7, r5, mask                @ even bytes of row 2
+
+    add     r6, r7, r6
+    add     r6, r6, lr                  @ lane sums + 1 (at most 511, so lanes cannot overflow)
+
+    and     r4, mask, r4, lsr #8        @ odd bytes of row 0
+    and     r5, mask, r5, lsr #8        @ odd bytes of row 2
+
+    mov     r6, r6, lsr #1
+    and     r6, r6, mask                @ (x + y + 1) >> 1 per lane; the mask drops bits shifted in from the upper lane
+
+    add     r4, r5, r4
+    add     r4, r4, lr
+
+    mov     r4, r4, lsr #1
+    and     r4, r4, mask                @ averaged odd bytes
+
+    orr     r5, r6, r4, lsl #8          @ re-interleave even and odd results
+
+    str     r5, [r3]                    @ write 4 averaged pixels to row 1
+
+    bgt     vl12_loop                   @ loop only while columns remain; bpl ran one extra word when the width was a multiple of 4
+
+    ldmia   sp!, {r4 - r7, pc}          @ restore registers and return
+    @   @|vertical_band_1_2_scale_armv4|
diff --git a/vpx_scale/symbian/scalesystemdependant.c b/vpx_scale/symbian/scalesystemdependant.c
new file mode 100644
index 000000000..a2acc3e9d
--- /dev/null
+++ b/vpx_scale/symbian/scalesystemdependant.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Sets the global scaler function pointers. The 1:2, 3:5,
+ *                  3:4, 2:3 and 4:5 horizontal-line and vertical-band
+ *                  pointers are set to the ARMv4 assembly routines; the
+ *                  "last vertical band" pointers and the 5:4, 5:3 and 2:1
+ *                  pointers are set to the C routines.
+ *
+ *  SPECIAL NOTES : No run-time feature detection is performed. The body is
+ *                  compiled out when VPX_NO_GLOBALS is defined.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config(void)
+{
+#ifndef VPX_NO_GLOBALS
+    /* Up-scaling ratios: ARMv4 assembly for the line/band routines. */
+    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_armv4;
+    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_armv4;
+    vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
+    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_armv4;
+    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_armv4;
+    vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
+    vp8_horizontal_line_3_4_scale        = horizontal_line_3_4_scale_armv4;
+    vp8_vertical_band_3_4_scale          = vertical_band_3_4_scale_armv4;
+    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_horizontal_line_2_3_scale        = horizontal_line_2_3_scale_armv4;
+    vp8_vertical_band_2_3_scale          = vertical_band_2_3_scale_armv4;
+    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
+    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_armv4;
+    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_armv4;
+    vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
+
+
+    /* Down-scaling ratios: C implementations only. */
+    vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
+    vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
+    vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
+    vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
+    vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
+    vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
+    vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
+#endif
+}
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
new file mode 100644
index 000000000..f4ab258ed
--- /dev/null
+++ b/vpx_scale/vpx_scale.mk
@@ -0,0 +1,23 @@
+# Source list for the vpx_scale component. Entries are appended to
+# SCALE_SRCS-<value>, where <value> is the expansion of the flag in the
+# variable name (presumably yes/no - confirm against the configure output).
+SCALE_SRCS-yes += vpx_scale.mk
+SCALE_SRCS-yes += scale_mode.h
+SCALE_SRCS-yes += yv12extend.h
+SCALE_SRCS-yes += yv12config.h
+SCALE_SRCS-yes += vpxscale.h
+SCALE_SRCS-yes += generic/vpxscale.c
+SCALE_SRCS-yes += generic/yv12config.c
+SCALE_SRCS-yes += generic/yv12extend.c
+SCALE_SRCS-yes += generic/scalesystemdependant.c
+# The generic scalers are listed under the CONFIG_SPATIAL_RESAMPLING flag.
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
+
+#arm
+# Under HAVE_ARMV7 the ARM system-dependent file is added and the generic one
+# is queued for removal via SCALE_SRCS_REMOVE.
+SCALE_SRCS-$(HAVE_ARMV7)         += arm/scalesystemdependant.c
+SCALE_SRCS-$(HAVE_ARMV7)         += arm/yv12extend_arm.c
+SCALE_SRCS_REMOVE-$(HAVE_ARMV7)  += generic/scalesystemdependant.c
+
+#neon
+# $(ASM) supplies the assembly file suffix.
+SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_copyframeyonly_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
+
+# Everything queued in SCALE_SRCS_REMOVE-yes is appended to the -no list.
+SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
diff --git a/vpx_scale/vpxscale.h b/vpx_scale/vpxscale.h
new file mode 100644
index 000000000..9a86b75de
--- /dev/null
+++ b/vpx_scale/vpxscale.h
@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef VPXSCALE_H
+#define VPXSCALE_H
+
+#include "vpx_scale/yv12config.h"
+/*
+ * C scaler routines, named <direction>_<from>_<to>_scale_c.
+ * horizontal_line_* take a source pointer/width and a destination
+ * pointer/width. vertical_band_* and last_vertical_band_* for the 4:5, 2:3,
+ * 3:5, 3:4 and 1:2 ratios take only a destination pointer, pitch and width;
+ * the 5:4, 5:3 and 2:1 vertical routines take separate source and
+ * destination pointers with their own pitches.
+ */
+void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+
+
+/*
+ * Global scaler function pointers. Each has the same signature as the
+ * routine of the matching name above; they are defined elsewhere and are
+ * expected to be set by vp8_scale_machine_specific_config().
+ */
+extern void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_2_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_5_4_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_5_3_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_2_1_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+extern void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+extern void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+
+/*
+ * ARMv4 assembly scalers (exported by gen_scalers_armv4.asm). Signatures
+ * match the corresponding vp8cx_*_scale_c routines.
+ * NOTE(review): these are declared unconditionally, including on builds
+ * that do not assemble the ARM sources.
+ */
+void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+
+
+/* NOTE(review): the name dmachine_specific_config looks unusual - confirm it matches its definition. */
+extern void  dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled);
+/* Frame-level entry point taking source and destination YV12 buffer descriptors. */
+extern void vp8_yv12_scale_or_center
+(
+    YV12_BUFFER_CONFIG *src_yuv_config,
+    YV12_BUFFER_CONFIG *dst_yuv_config,
+    int expanded_frame_width,
+    int expanded_frame_height,
+    int scaling_mode,
+    int HScale,
+    int HRatio,
+    int VScale,
+    int VRatio
+);
+/* Frame scaler taking horizontal and vertical scale/ratio pairs and a caller-supplied temp area. */
+extern void vp8_scale_frame
+(
+    YV12_BUFFER_CONFIG *src,
+    YV12_BUFFER_CONFIG *dst,
+    unsigned char *temp_area,
+    unsigned char temp_height,
+    unsigned int hscale,
+    unsigned int hratio,
+    unsigned int vscale,
+    unsigned int vratio,
+    unsigned int interlaced
+);
+/* Platform hook for setting the scaler function pointers declared in this header. */
+extern void vp8_scale_machine_specific_config(void);
+
+/* Border extension: function pointer plus the plain and NEON-named routines. */
+extern void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf);
+
+/* "yonly" frame copy: function pointer plus the plain and NEON-named routines. */
+extern void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+/* Frame copy: function pointer plus the plain and NEON-named routines. */
+extern void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+#endif
diff --git a/vpx_scale/wce/gen_scalers_armv4.asm b/vpx_scale/wce/gen_scalers_armv4.asm
new file mode 100644
index 000000000..1c904edae
--- /dev/null
+++ b/vpx_scale/wce/gen_scalers_armv4.asm
@@ -0,0 +1,773 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |horizontal_line_4_5_scale_armv4|
+    EXPORT  |vertical_band_4_5_scale_armv4|
+    EXPORT  |horizontal_line_2_3_scale_armv4|
+    EXPORT  |vertical_band_2_3_scale_armv4|
+    EXPORT  |horizontal_line_3_5_scale_armv4|
+    EXPORT  |vertical_band_3_5_scale_armv4|
+    EXPORT  |horizontal_line_3_4_scale_armv4|
+    EXPORT  |vertical_band_3_4_scale_armv4|
+    EXPORT  |horizontal_line_1_2_scale_armv4|
+    EXPORT  |vertical_band_1_2_scale_armv4|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+src         RN  r0
+srcw        RN  r1
+dest        RN  r2
+mask        RN  r12
+c51_205     RN  r10
+c102_154    RN  r11
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_4_5_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Stride of source.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 4 to 5.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+;   r0 = UINT8 *source
+;   r1 = UINT32 source_width
+;   r2 = UINT8 *dest
+;   r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ; Register roles: src (r0) = read pointer, srcw (r1) = source pixels
+    ; left, dest (r2) = write pointer, mask (r12) = byte selector.
+    ;
+    ; Packed multiply: for bytes x and y,
+    ;   (x | y << 16) * (W1 | W0 << 16)
+    ; leaves W0 * x + W1 * y in bits 31:16 of the 32-bit product, so a
+    ; single mul gives a two-tap weighted sum. The constant is always the
+    ; first source operand, which keeps Rd different from Rm (presumably to
+    ; satisfy the MUL operand restriction on older cores).
+    ;
+    ; NOTE(review): the sum sits in bits 31:16, so "+ 0x8000" followed by
+    ; "lsr #24" is not the same as (sum + 128) >> 8 - confirm this is the
+    ; intended rounding.
+    mov     mask, #255              ; mask for selection
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+    ldr     r3, [src], #4           ; preload the first 4 source pixels
+
+hl45_loop
+
+    and     r4, r3, mask            ; a = src[0]
+    and     r5, mask, r3, lsr #8    ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    and     r7, mask, r3, lsr #16   ; c = src[2]
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24   ; d = src[3]
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    ldr     r3, [src], #4           ; load the next 4 source pixels
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x8000
+    and     r9, mask, r3            ; e = src[4]
+    orr     r9, r9, r8, lsl #16     ; d | e
+    mul     r9, c51_205, r9         ; d * 205 + 51 * e
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #4          ; 4 source pixels consumed
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          ; out[4]
+
+    bne     hl45_loop
+
+    ; Tail: one more group from the word already held in r3. The bytes
+    ; are extracted with lsr exactly as in the loop; the previous lsl
+    ; shifts always produced 0 under the 0xff mask.
+    and     r4, r3, mask            ; a
+    and     r5, mask, r3, lsr #8    ; b
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    and     r7, mask, r3, lsr #16   ; c
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24   ; d
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    ldrb    r3, [src]               ; byte following the last word loaded
+    strb    r3, [dest], #1          ; out[4] is copied unfiltered
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_4_5_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
+; *                  height of the band scaled is 4-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ; Works one column (one byte per row) per loop pass. Rows 0-3 and row 5
+    ; are read; rows 1-4 are rewritten.
+    ; Packed multiply: (x | y << 16) * (W1 | W0 << 16) leaves
+    ; W0 * x + W1 * y in bits 31:16 of the product.
+    ; NOTE(review): "+ 0x8000" followed by "lsr #24" is not the same as
+    ; (sum + 128) >> 8 - confirm this is the intended rounding.
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+vl45_loop
+    mov     r3, src                 ; r3 = read pointer, walks down the column
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
+    add     lr, src, r1             ; lr = write pointer, starts at row 1
+
+    orr     r6, r4, r5, lsl #16     ; b | a
+    mul     r6, c51_205, r6         ; a * 51 + 205 * b
+
+    ldrb    r8, [r3], r1            ; d = des[dest_pitch*3]
+    orr     r5, r5, r7, lsl #16     ; c | b
+    mul     r5, c102_154, r5        ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    orr     r7, r8, r7, lsl #16     ; c | d
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1            ; row 1
+
+    ldrb    r9, [r3, r1]            ; e = des [dest_pitch * 5]
+    mul     r7, c102_154, r7        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    orr     r9, r9, r8, lsl #16     ; d | e
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            ; row 2
+
+    mul     r9, c51_205, r9         ; d * 205 + 51 * e
+    add     r7, r7, #0x8000
+    add     src, src, #1            ; next column
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            ; row 3
+
+    add     r9, r9, #0x8000
+    subs    r2, r2, #1              ; one column done; flags feed the bne below
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            ; row 4
+
+    bne     vl45_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_2_3_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Stride of source.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 2 to 3.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ; Each loop pass reads source pixels a, b, c (c is the first pixel of
+    ; the next pair and is not consumed) and writes three output pixels.
+    ldr     lr,  =85                ; weight of the outer pixel
+    ldr     r12, =171               ; weight of the centre pixel
+
+hl23_loop
+
+    ldrb    r3, [src], #1           ; a
+    ldrb    r4, [src], #1           ; b
+    ldrb    r5, [src]               ; c (src is left pointing at it)
+
+    strb    r3, [dest], #1          ; out[0] = a
+    mul     r4, r12, r4             ; b * 171
+    mla     r6, lr, r3, r4          ; a * 85 + b * 171
+    mla     r7, lr, r5, r4          ; c * 85 + b * 171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest], #1          ; out[1] = (a * 85 + b * 171 + 128) >> 8
+
+    add     r7, r7, #128
+    mov     r7, r7, lsr #8
+    strb    r7, [dest], #1          ; out[2] = (b * 171 + c * 85 + 128) >> 8
+
+    subs    srcw, srcw, #2          ; 2 source pixels consumed
+    bne     hl23_loop
+
+    ; Tail: r5 still holds the pixel at [src] (used as a); b is the next one.
+    ldrb    r4, [src, #1]           ; b
+    strb    r5, [dest], #1          ; out[0] = a
+    strb    r4, [dest, #1]          ; out[2] = b (copied unfiltered)
+
+    mul     r4, r12, r4             ; b * 171
+    mla     r6, lr, r5, r4          ; a * 85 + b *171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest]              ; out[1]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_2_3_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
+; *                  height of the band scaled is 2-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+    stmdb   sp!, {r4 - r8, lr}
+    ; One column per loop pass: reads rows 0, 1 and 3, rewrites rows 1 and 2.
+    ldr     lr,  =85                ; weight of the outer row
+    ldr     r12, =171               ; weight of the centre row
+    add     r3, r1, r1, lsl #1      ; 3 * dest_pitch
+
+vl23_loop
+    ldrb    r4, [src]               ; a = des [0]
+    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
+    ldrb    r7, [src, r3]           ; c = des [dest_pitch*3]
+    subs    r2, r2, #1              ; flags are consumed by the bne at the bottom
+
+    mul     r5, r12, r5             ; b * 171
+    mla     r6, lr, r4, r5          ; a * 85 + b * 171
+    mla     r8, lr, r7, r5          ; c * 85 + b * 171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [src, r1]           ; row 1 = (a * 85 + b * 171 + 128) >> 8
+
+    add     r8, r8, #128
+    mov     r8, r8, lsr #8
+    strb    r8, [src, r1, lsl #1]   ; row 2 = (b * 171 + c * 85 + 128) >> 8
+
+    add     src, src, #1            ; next column
+
+    bne     vl23_loop
+
+    ldmia   sp!, {r4 - r8, pc}
+    ENDP    ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_3_5_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Stride of source.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 3 to 5.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ; Each loop pass turns source pixels a, b, c (plus the following pixel
+    ; d, which becomes a of the next pass) into five output pixels.
+    ; Packed multiply: (x | y << 16) * (W1 | W0 << 16) leaves
+    ; W0 * x + W1 * y in bits 31:16 of the product.
+    ; NOTE(review): "+ 0x8000" followed by "lsr #24" is not the same as
+    ; (sum + 128) >> 8 - confirm this is the intended rounding.
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+    ldrb    r4, [src], #1           ; a = src[0]
+
+hl35_loop
+
+    ldrb    r8, [src], #1           ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r8, lsl #16     ; b | a
+    ldrb    r9, [src], #1           ; c = src[2]
+    mul     r6, c102_154, r6        ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     ; b | c
+    mul     r5, c51_205, r5         ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    ldrb    r4, [src], #1           ; d = src[3]
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r9, lsl #16     ; c | b
+    mul     r7, c51_205, r7         ; c * 205 + 51 * b
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    orr     r9, r4, r9, lsl #16     ; c | d
+    mul     r9, c102_154, r9        ; c * 154 + 102 * d
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #3          ; 3 source pixels consumed
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1          ; out[4]
+
+    bpl     hl35_loop
+
+    ; Tail: final group, whose last output is c copied unfiltered. b must
+    ; be loaded into r8 because every use below reads r8; loading it into
+    ; r5 left r8 holding b from the previous loop pass.
+    ldrb    r8, [src], #1           ; b = src[1]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    orr     r6, r4, r8, lsl #16     ; b | a
+    ldrb    r9, [src], #1           ; c = src[2]
+    mul     r6, c102_154, r6        ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16     ; b | c
+    mul     r5, c51_205, r5         ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1          ; out[1]
+
+    orr     r7, r8, r9, lsl #16     ; c | b
+    mul     r7, c51_205, r7         ; c * 205 + 51 * b
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1          ; out[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1          ; out[3]
+    strb    r9, [dest], #1          ; out[4] = c
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_3_5_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
+; *                  height of the band scaled is 3-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ; One column per loop pass: reads rows 0, 1, 2 and 5, rewrites rows 1-4.
+    ; Packed multiply: (x | y << 16) * (W1 | W0 << 16) leaves
+    ; W0 * x + W1 * y in bits 31:16 of the product.
+    ; NOTE(review): "+ 0x8000" followed by "lsr #24" is not the same as
+    ; (sum + 128) >> 8 - confirm this is the intended rounding.
+    ldr     c51_205, =0x3300cd      ; (51 << 16) | 205
+    ldr     c102_154, =0x66009a     ; (102 << 16) | 154
+
+vl35_loop
+    mov     r3, src                 ; r3 = read pointer, walks down the column
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
+    add     lr, src, r1             ; lr = write pointer, starts at row 1
+
+    orr     r8, r4, r5, lsl #16     ; b | a
+    mul     r6, c102_154, r8        ; a * 102 + 154 * b
+
+    ldrb    r8, [r3, r1, lsl #1]    ; d = des[dest_pitch*5]
+    orr     r3, r7, r5, lsl #16     ; b | c (r3 is reused as scratch from here)
+    mul     r9, c51_205, r3         ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    orr     r3, r5, r7, lsl #16     ; c | b
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1            ; row 1
+
+    mul     r5, c51_205, r3         ; c * 205 + 51 * b
+    add     r9, r9, #0x8000
+    orr     r3, r8, r7, lsl #16     ; c | d
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1            ; row 2
+
+    mul     r7, c102_154, r3        ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    add     src, src, #1            ; next column
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1            ; row 3
+
+    add     r7, r7, #0x8000
+    subs    r2, r2, #1              ; one column done; flags feed the bne below
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1            ; row 4
+
+
+    bne     vl35_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_3_4_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Stride of source.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 3 to 4.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ; Each loop pass turns source pixels a, b, c (plus the following pixel,
+    ; which becomes a of the next pass) into four output pixels.
+    ldr     r10, =64                ; weight 64
+    ldr     r11, =192               ; weight 192
+    mov     r9, #128                ; rounding term for the >> 8
+
+    ldrb    r4, [src], #1           ; a = src[0]
+
+hl34_loop
+
+    ldrb    r8, [src], #1           ; b = src[1]
+    ldrb    r7, [src], #1           ; c = src[2]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r8, r4         ; a*64 + b*192 + 128
+
+    add     r8, r8, #1              ; b + 1
+    add     r8, r8, r7              ; b + c + 1
+    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          ; out[1]
+
+    strb    r8, [dest], #1          ; out[2]
+
+    ldrb    r4, [src], #1           ; next a = src[3]
+
+    mla     r7, r11, r7, r9         ; c*192 + 128
+    mla     r7, r4, r10, r7         ; c*192 + next a*64 + 128
+
+    subs    srcw, srcw, #3          ; 3 source pixels consumed; flags feed the bpl
+
+    mov     r7, r7, asr #8          ; (c*192 + next a*64 + 128) >> 8
+    strb    r7, [dest], #1          ; out[3]
+
+    bpl     hl34_loop
+
+    ; Tail: final group, whose last output is c copied unfiltered.
+    ldrb    r8, [src], #1           ; b = src[1]
+    ldrb    r7, [src], #1           ; c = src[2]
+    strb    r4, [dest], #1          ; out[0] = a
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r8, r4         ; a*64 + b*192 + 128
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1          ; out[1]
+
+    add     r8, r8, #1              ; b + 1
+    add     r8, r8, r7              ; b + c + 1
+    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
+    strb    r8, [dest], #1          ; out[2]
+    strb    r7, [dest], #1          ; out[3] = c
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_3_4_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
+; *                  height of the band scaled is 3-pixels.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}     ; save work registers and the return address
+    ; r0 (src alias) = dest column pointer, r1 = dest_pitch, r2 = columns left (>= 1)
+    ldr     r10, =64                ; weight 64/256
+    ldr     r11, =192               ; weight 192/256
+    mov     r9, #128                ; rounding term added before the >> 8
+
+;   ldr     r1,[r1]
+vl34_loop
+    mov     r3, src                 ; r3 = read cursor, starts on row 0 of this column
+    ldrb    r4, [r3], r1            ; a = des [0]
+    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1            ; c = des [dest_pitch*2]
+    add     lr, src, r1             ; lr = write cursor, starts on row 1
+
+    mla     r4, r10, r4, r9         ; a*64 + 128
+    mla     r4, r11, r5, r4         ; a*64 + b*192 + 128
+
+    add     r5, r5, #1              ; b + 1
+    add     r5, r5, r7              ; b + c + 1
+    mov     r5, r5, asr #1          ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8          ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [lr], r1            ; des [dest_pitch] = result; lr moves to row 2
+
+    ldrb    r4, [r3, r1]            ; a = des [dest_pitch*4] (r3 is on row 3 here)
+
+    strb    r5, [lr], r1            ; des [dest_pitch*2] = (b + c + 1) >> 1; lr moves to row 3
+
+    mla     r7, r11, r7, r9         ; c*192 + 128
+    mla     r7, r4, r10, r7         ; c*192 + a*64 + 128
+    mov     r7, r7, asr #8          ; (c*192 + a*64 + 128) >> 8
+
+    add     src, src, #1            ; step to the next column
+    subs    r2, r2, #1              ; one column done; Z is tested by the bne below
+
+    strb    r7, [lr]                ; des [dest_pitch*3] = result (flags are untouched)
+
+    bne     vl34_loop               ; body runs before the test, so r2 == 0 on entry is not handled
+
+    ldmia   sp!, {r4 - r11, pc}     ; restore registers and return
+    ENDP    ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : horizontal_line_1_2_scale_armv4
+; *
+; *  INPUTS        : const unsigned char *source : Pointer to source data.
+; *                  unsigned int source_width    : Width of the source line in pixels.
+; *                  unsigned char *dest         : Pointer to destination data.
+; *                  unsigned int dest_width      : Stride of destination (NOT USED).
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Copies horizontal line of pixels from source to
+; *                  destination scaling up by 1 to 2.
+; *
+; *  SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+;   const unsigned char *source,
+;   unsigned int source_width,
+;   unsigned char *dest,
+;   unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r5, lr}
+    ; src (r0) = source line, srcw (r1) = source width in pixels (must be >= 1),
+    ; dest (r2) = destination line; 2 * width bytes are written with halfword stores.
+    subs    srcw, srcw, #1          ; srcw = number of (a, b) neighbour pairs; Z set if width == 1
+    ldrb    r3, [src], #1           ; a = src[0] (ldrb leaves the flags alone)
+    moveq   r4, r3                  ; width == 1: there is no neighbour, so b = a ...
+    beq     hl12_last               ; ... and src[1] is never read
+    ldrb    r4, [src], #1           ; b = src[1]
+hl12_loop
+    subs    srcw, srcw, #1          ; Z set while the final pair is being processed
+    add     r5, r3, r4
+    add     r5, r5, #1
+    mov     r5, r5, lsr #1          ; (a + b + 1) >> 1
+
+    orr     r5, r3, r5, lsl #8      ; low byte = a, high byte = rounded average
+    strh    r5, [dest], #2          ; NOTE(review): assumes dest is halfword aligned and little-endian
+
+    mov     r3, r4                  ; a = b
+    ldrneb  r4, [src], #1           ; load the next b only when another pair follows
+    bne     hl12_loop
+hl12_last
+    orr     r5, r4, r4, lsl #8      ; last pixel has no right neighbour: emit it twice
+    strh    r5, [dest]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; *  ROUTINE       : vertical_band_1_2_scale_armv4
+; *
+; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
+; *                  unsigned int dest_pitch : Stride of destination data.
+; *                  unsigned int dest_width : Width of destination data.
+; *
+; *  OUTPUTS       : None.
+; *
+; *  RETURNS       : void
+; *
+; *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
+; *                  height of the band scaled is 1-pixel.
+; *
+; *  SPECIAL NOTES : The routine uses the first line of the band below
+; *                  the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+;   r0 = UINT8 *dest
+;   r1 = UINT32 dest_pitch
+;   r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r7, lr}
+    ; r0 (src alias) = dest, r1 = dest_pitch, r2 = dest_width; 4 pixels per pass
+    ldr     mask, =0xff00ff             ; selects bytes 0 and 2 of a word
+    ldr     lr, = 0x010001              ; +1 rounding term for both selected bytes
+
+vl12_loop
+    mov     r3, src
+    ldr     r4, [r3], r1                ; 4 pixels of the line above; r3 moves to the line to fill
+    ldr     r5, [r3, r1]                ; 4 pixels of the line below (assumes word-aligned rows - TODO confirm)
+
+    add     src, src, #4                ; next group of 4 columns
+    subs    r2, r2, #4                  ; flags survive until the bgt at the loop end
+
+    and     r6, r4, mask                ; bytes 0 and 2 of the upper line
+    and     r7, r5, mask                ; bytes 0 and 2 of the lower line
+
+    add     r6, r7, r6
+    add     r6, r6, lr                  ; a + b + 1 in each 16-bit lane
+
+    and     r4, mask, r4, lsr #8        ; bytes 1 and 3 of the upper line
+    and     r5, mask, r5, lsr #8        ; bytes 1 and 3 of the lower line
+
+    mov     r6, r6, lsr #1
+    and     r6, r6, mask                ; (a + b + 1) >> 1; bit shifted in from the upper lane is masked off
+
+    add     r4, r5, r4
+    add     r4, r4, lr                  ; a + b + 1 for bytes 1 and 3
+
+    mov     r4, r4, lsr #1
+    and     r4, r4, mask                ; (a + b + 1) >> 1 for bytes 1 and 3
+
+    orr     r5, r6, r4, lsl #8          ; re-interleave the even and odd byte results
+
+    str     r5, [r3]                    ; store the 4 interpolated pixels on the middle line
+    ; bgt rather than bpl: a width that is a multiple of 4 must not run one extra pass
+    bgt     vl12_loop
+
+    ldmia   sp!, {r4 - r7, pc}
+    ENDP    ;|vertical_band_1_2_scale_armv4|
+
+    END
diff --git a/vpx_scale/wce/scalesystemdependant.c b/vpx_scale/wce/scalesystemdependant.c
new file mode 100644
index 000000000..a5a6a5275
--- /dev/null
+++ b/vpx_scale/wce/scalesystemdependant.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Unconditionally installs the scaler function pointers:
+ *                  *_armv4 routines where present, vp8cx_*_c routines otherwise.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_scale_machine_specific_config()
+{
+    /* 1_2 .. 4_5 ratios: *_armv4 line/band routines, vp8cx_*_c for the last band. */
+    vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4;
+    vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4;
+    vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4;
+    vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4;
+    vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4;
+    vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4;
+    vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4;
+    vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4;
+    vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4;
+    vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4;
+    vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+    vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+    vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+    /* 5_4, 5_3 and 2_1 ratios: every pointer takes a vp8cx_*_c routine. */
+    vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
+    vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c;
+    vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c;
+    vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c;
+    vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c;
+    vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c;
+    vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c;
+}
diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c
new file mode 100644
index 000000000..da0533e6b
--- /dev/null
+++ b/vpx_scale/win32/scaleopt.c
@@ -0,0 +1,1749 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     scaleopt.c
+*
+*   Description  :     Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+*  Module Statics
+****************************************************************************/
+__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
+__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
+__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_3_5_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;                      // parameter is not used by this routine
+
+    __asm
+    {
+        // Each pass turns 3 source pixels into 5 destination pixels; the last group is done after the loop.
+        push ebx                            // ebx is used as scratch below
+
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        lea         edx,    [esi+ecx-3];    // edx = address of the last 3-pixel group
+
+        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
+        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        horiz_line_3_5_loop:
+
+        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
+        mov        ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             // 01*154 02* 51 02*205 03*102
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             // 00*102 01*205 01* 51 02*154
+
+        mov         [edi],  ebx             // write output 00 (bytes 1..3 are overwritten by the movd below)
+        add         esi,    3               // consumed 3 source pixels
+
+        add         edi,    5               // produced 5 destination pixels
+        paddw       mm0,    mm1             // sum the two weighted neighbours
+
+        paddw       mm0,    mm4             // + 128 for rounding
+        psrlw       mm0,    8
+
+        cmp         esi,    edx
+        packuswb    mm0,    mm7             // pack the four results back to bytes
+
+        movd        DWORD Ptr [edi-4], mm0  // write outputs 01 02 03 04 of this group
+        jl          horiz_line_3_5_loop
+
+//Exit: last group - there is no valid pixel 03 to the right, so 02 is used in its place
+        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
+        mov         ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         eax,    8               // eax = xx 01 02 02
+        and         eax,    0xffff0000      // eax = xx xx 02 02
+
+        or          eax,    ebx             // eax = 01 02 02 02
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             // 01*154 02* 51 02*205 02*102
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             // 00*102 01*205 01* 51 02*154
+
+        mov         [edi],  ebx             // write output 00 (bytes 1..3 are overwritten by the movd below)
+        paddw       mm0,    mm1
+
+        paddw       mm0,    mm4             // + 128 for rounding
+        psrlw       mm0,    8
+
+        packuswb    mm0,    mm7
+        movd        DWORD Ptr [edi+1], mm0  // write outputs 01 02 03 04 of the last group
+
+        pop ebx
+        // NOTE(review): no emms is issued here - confirm the caller clears the MMX state
+    }
+
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_4_5_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void)dest_width;                       // parameter is not used by this routine
+
+    __asm
+    {
+        // Each pass turns 8 source pixels into 10 destination pixels; the last group is done after the loop.
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        lea         edx,    [esi+ecx-8];    // edx = address of the last 8-pixel group
+
+        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
+        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        horiz_line_4_5_loop:
+
+        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
+
+        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [edi+5], mm2            // write output 05 xx xx xx (source pixel 04)
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
+
+        paddw       mm0,    mm1             // sum the two weighted neighbours
+        paddw       mm0,    mm4             // added round values
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7
+
+        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
+        add         edi,    10              // produced 10 destination pixels
+
+        add         esi,    8               // consumed 8 source pixels
+        paddw       mm2,    mm3             // sum the two weighted neighbours
+
+        paddw       mm2,    mm4             // added round values
+        cmp         esi,    edx
+
+        psrlw       mm2,    8
+        packuswb    mm2,    mm7
+
+        movd        DWORD PTR [edi-4], mm2 // write output 06 07 08 09
+        jl         horiz_line_4_5_loop
+
+//Exit: last group - pixel 08 is not loaded; 07 is duplicated in its place
+        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
+
+        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
+        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
+
+        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
+        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
+
+        movq        mm3,    mm1
+
+        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [edi+5], mm2  // write output 05 xx xx xx (source pixel 04)
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
+
+        paddw       mm0,    mm1             // sum the two weighted neighbours
+        paddw       mm0,    mm4             // added round values
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
+
+        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
+        paddw       mm2,    mm3             // sum the two weighted neighbours
+
+        paddw       mm2,    mm4             // added round values
+        psrlw       mm2,    8
+
+        packuswb    mm2,    mm7
+        movd        DWORD PTR [edi+6], mm2  // write output 06 07 08 09
+
+        // NOTE(review): no emms is issued here - confirm the caller clears the MMX state
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has a "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        // Blends rows in place, 8 columns per pass: rows 1..4 are rewritten from rows 0..3 and row 5.
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // constant 4/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+        movq        mm1,    [edi]                   // mm1=Src[3];
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [edi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3]
+        //  mm7 -- cleared for unpacking
+
+        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group (row 5 from dest)
+
+        movq        mm5,    four_fifths              // mm5 = 4/5
+        pmullw      mm1,    mm5                     // d * 4/5
+
+        movq        mm6,    one_fifth                // mm6 = 1/5
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm3,    mm5                     // d * 4/5
+        punpcklbw   mm0,    mm7                     // unpack low
+
+        pmullw      mm0,    mm6                     // an * 1/5
+        punpckhbw   mm2,    mm7                     // unpack high
+
+        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
+        pmullw      mm2,    mm6                     // an * 1/5
+
+        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm1        // write des[4]
+
+        add         edi,    8                       // advance both row cursors by 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done
+        jg         vs_4_5_loop                      // loop while columns remain; no emms is issued afterwards
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has an "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        last_vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // constant 4/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+        movq        mm1,    [edi]                   // mm1=Src[3];
+
+        movq        QWORD ptr [edi+ecx], mm1        // write des[4]: plain copy of src[3], no row below is read
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [edi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3]
+        //  mm7 -- cleared for unpacking
+        add         edi,    8                       // advance both row cursors by 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done
+        jg          last_vs_4_5_loop                // loop while columns remain; no emms is issued afterwards
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    : Top row of the band; scaled in place.
+ *                  unsigned int dest_pitch : Byte offset from one row to the next.
+ *                  unsigned int dest_width : Bytes per row to process (8 per pass).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *                  Rows 0-2 are the inputs a, b, c; rows 1-4 are rewritten.
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band (row 5) when forming row 4. The
+ *                  function also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
+        mov         ecx,    dest_pitch               // ecx = pitch, i.e. bytes between rows
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below: edi = row 3
+
+        pxor        mm7,    mm7                     // clear out mm7 (zero for byte->word unpacking)
+        mov         edx,    dest_width               // Loop counter, decremented by 8 per pass
+
+        vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0] (a), 8 columns
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1] (b), 8 columns
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1] (b is already held in mm1/mm3)
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (b) low/high words
+        // mm0 --- Src[2] (c), still packed bytes
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high (last use of zero in mm7 before it becomes scratch)
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high (mm7 borrowed as scratch)
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2] (c is already held in mm0/mm2)
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [edi], mm4            // write des[3]
+
+        //  mm0, mm2 --- Src[2] (c) low/high words
+
+        pxor        mm7,    mm7                     // clear mm7 for unpacking (it was used as scratch above)
+        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group (row 5)
+
+        movq        mm5,    three_fifths             // mm5 = 3/5
+        pmullw      mm0,    mm5                     // c * 3/5
+
+        movq        mm6,    two_fifths                // mm6 = 2/5
+        movq        mm3,    mm1                     // make a copy
+
+        pmullw      mm2,    mm5                     // c * 3/5
+        punpcklbw   mm1,    mm7                     // unpack low
+
+        pmullw      mm1,    mm6                     // an * 2/5
+        punpckhbw   mm3,    mm7                     // unpack high
+
+        paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
+        pmullw      mm3,    mm6                     // an * 2/5
+
+        paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
+
+        add         edi,    8                       // advance both row pointers by 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done
+        jg          vs_3_5_loop                     // repeat while columns remain
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    : Top row of the band; scaled in place.
+ *                  unsigned int dest_pitch : Byte offset from one row to the next.
+ *                  unsigned int dest_width : Bytes per row to process (8 per pass).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *                  Rows 0-2 are the inputs a, b, c; rows 1-4 are rewritten.
+ *  SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx no band below is
+ *                  read: row 4 is a plain copy of source row 2 (c). The
+ *                  function also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
+        mov         ecx,    dest_pitch               // ecx = pitch, i.e. bytes between rows
+
+        lea         edi,    [esi+ecx*2]             // two lines below
+        add         edi,    ecx                     // three lines below: edi = row 3
+
+        pxor        mm7,    mm7                     // clear out mm7 (zero for byte->word unpacking)
+        mov         edx,    dest_width               // Loop counter, decremented by 8 per pass
+
+
+        last_vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0] (a), 8 columns
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1] (b), 8 columns
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word (requires mm7 == 0)
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1] (b is already held in mm1/mm3)
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+
+
+        // mm1, mm3 --- Src[1] (b) low/high words
+        // mm0 --- Src[2] (c), still packed bytes
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        QWORD ptr [edi+ecx], mm0        // write des[4] = unmodified copy of src[2]
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high (last use of zero in mm7 before it becomes scratch)
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high (mm7 borrowed as scratch)
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2] (c is already held in mm0/mm2)
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [edi], mm4            // write des[3]
+
+        // mm7 was used as scratch above and must be zero again before the next pass unpacks with it
+        pxor        mm7,    mm7                     // clear mm7 for unpacking
+        add         edi,    8                       // advance both row pointers by 8 columns
+        add         esi,    8
+
+        sub         edx,    8                       // 8 columns done
+        jg          last_vs_3_5_loop                // repeat while columns remain
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    : Row 0 of the band; scaled in place.
+ *                  unsigned int dest_pitch : Byte offset from one row to the next.
+ *                  unsigned int dest_width : Bytes per row to process (8 per pass).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
+ *                  Row 1 = (row 0 + row 2 + 1) / 2, written in place.
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band (row 2). The function also has a
+ *                  "C" only version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
+        mov         ecx,    dest_pitch               // ecx = pitch, i.e. bytes between rows
+
+        pxor        mm7,    mm7                     // clear out mm7 (zero for byte->word unpacking)
+        mov         edx,    dest_width               // Loop counter, decremented by 8 per pass
+
+        vs_1_2_loop:
+
+        movq        mm0,    [esi]                   // get Src[0] (row 0)
+        movq        mm1,    [esi + ecx * 2]         // get Src[1] (row 2, the next source line)
+
+        movq        mm2,    mm0                     // make copy before unpack
+        movq        mm3,    mm1                     // make copy before unpack
+
+        punpcklbw   mm0,    mm7                     // low Src[0]
+        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
+
+        punpcklbw   mm1,    mm7                     // low Src[1]
+        paddw       mm0,    mm1                     // low (a + b)
+
+        punpckhbw   mm2,    mm7                     // high Src[0]
+        paddw       mm0,    mm6                     // low (a + b + 1)
+
+        punpckhbw   mm3,    mm7                     // high Src[1]
+        paddw       mm2,    mm3                     // high (a + b )
+
+        psraw       mm0,    1                       // low (a + b +1 )/2
+        paddw       mm2,    mm6                     // high (a + b + 1)
+
+        psraw       mm2,    1                       // high (a + b + 1)/2
+        packuswb    mm0,    mm2                     // pack results
+
+        movq        [esi+ecx], mm0                  // write out eight bytes to row 1
+        add         esi,    8                       // next 8 columns
+
+        sub         edx,    8                       // 8 columns done
+        jg          vs_1_2_loop                     // repeat while columns remain
+    }
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    : Row 0 of the band; scaled in place.
+ *                  unsigned int dest_pitch : Byte offset from one row to the next.
+ *                  unsigned int dest_width : Bytes per row to process (8 per pass).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
+ *                  Row 1 is written as an exact copy of row 0.
+ *  SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx no band below is
+ *                  read, so no averaging takes place. The function also
+ *                  has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
+        mov         ecx,    dest_pitch               // ecx = pitch, i.e. bytes between rows
+
+        mov         edx,    dest_width               // Loop counter, decremented by 8 per pass
+
+        last_vs_1_2_loop:
+
+        movq        mm0,    [esi]                   // get Src[0]
+        movq        [esi+ecx], mm0                  // write out eight bytes, unchanged, one row down
+
+        add         esi,    8                       // next 8 columns
+        sub         edx,    8                       // 8 columns done
+
+        jg         last_vs_1_2_loop                 // repeat while columns remain
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_1_2_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source pixels.
+ *                  unsigned int source_width    : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination pixels.
+ *                  unsigned int dest_width      : NOT USED.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
+ *                  Output is p0, (p0+p1+1)/2, p1, (p1+p2+1)/2, ...
+ *  SPECIAL NOTES : The loop body runs before the width test and a final
+ *                  8-pixel pass always follows: >= 16 pixels are processed.
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+        mov         esi,    source                  // esi = read pointer
+        mov         edi,    dest                    // edi = write pointer
+
+        pxor        mm7,    mm7                     // zero for byte->word unpacking
+        movq        mm6,    four_ones               // rounding term added before the halving shift
+
+        mov         ecx,    source_width            // ecx = source pixels still to do
+
+        hs_1_2_loop:
+
+        movq        mm0,    [esi]                   // p0..p7
+        movq        mm1,    [esi+1]                 // p1..p8, the right-hand neighbours
+
+        movq        mm2,    mm0                     // copy for the high half
+        movq        mm3,    mm1                     // copy for the high half
+
+        movq        mm4,    mm0                     // keep the original bytes for interleaving
+        punpcklbw   mm0,    mm7                     // low 4 pixels to words
+
+        punpcklbw   mm1,    mm7                     // low 4 neighbours to words
+        paddw       mm0,    mm1                     // low (p + next)
+
+        paddw       mm0,    mm6                     // low (p + next + 1)
+        punpckhbw   mm2,    mm7                     // high 4 pixels to words
+
+        punpckhbw   mm3,    mm7                     // high 4 neighbours to words
+        paddw       mm2,    mm3                     // high (p + next)
+
+        paddw       mm2,    mm6                     // high (p + next + 1)
+        psraw       mm0,    1                       // low (p + next + 1)/2
+
+        psraw       mm2,    1                       // high (p + next + 1)/2
+        packuswb    mm0,    mm2                     // mm0 = 8 interpolated bytes
+
+        movq        mm2,    mm4                     // original p0..p7
+        punpcklbw   mm2,    mm0                     // interleave: p0 i0 p1 i1 p2 i2 p3 i3
+
+        movq        [edi],  mm2                     // first 8 output bytes
+        punpckhbw   mm4,    mm0                     // interleave: p4 i4 p5 i5 p6 i6 p7 i7
+
+        movq        [edi+8], mm4                    // second 8 output bytes
+        add         esi,    8                       // consumed 8 source pixels
+
+        add         edi,    16                      // produced 16 output pixels
+        sub         ecx,    8
+
+        cmp         ecx,    8                       // leave the final 8 pixels to the code below
+        jg          hs_1_2_loop
+
+// last eight pixels: the final pixel has no right-hand neighbour, so it is paired with itself
+
+        movq        mm0,    [esi]                   // p0..p7 of the last group
+        movq        mm1,    mm0
+
+        movq        mm2,    mm0                     // copy for the high half
+        movq        mm3,    mm1
+
+        psrlq       mm1,    8                       // p1..p7 moved down one byte, top byte zero
+        psrlq       mm3,    56                      // isolate p7 in the lowest byte
+
+        psllq       mm3,    56                      // p7 back in the top byte, everything else zero
+        por         mm1,    mm3                     // neighbours = p1..p7, p7
+
+        movq        mm3,    mm1                     // copy of the neighbours for the high half
+        movq        mm4,    mm0                     // keep the original bytes for interleaving
+
+        punpcklbw   mm0,    mm7                     // low 4 pixels to words
+        punpcklbw   mm1,    mm7                     // low 4 neighbours to words
+
+        paddw       mm0,    mm1                     // low (p + next)
+        paddw       mm0,    mm6                     // low (p + next + 1)
+
+        punpckhbw   mm2,    mm7                     // high 4 pixels to words
+        punpckhbw   mm3,    mm7                     // high 4 neighbours to words
+
+        paddw       mm2,    mm3                     // high (p + next)
+        paddw       mm2,    mm6                     // high (p + next + 1)
+
+        psraw       mm0,    1                       // low (p + next + 1)/2
+        psraw       mm2,    1                       // high (p + next + 1)/2
+
+        packuswb    mm0,    mm2                     // mm0 = 8 interpolated bytes
+        movq        mm2,    mm4                     // original p0..p7
+
+        punpcklbw   mm2,    mm0                     // interleave the low half
+        movq        [edi],  mm2                     // first 8 output bytes
+
+        punpckhbw   mm4,    mm0                     // interleave the high half
+        movq        [edi+8], mm4                    // second 8 output bytes
+    }
+}
+
+
+
+
+
+__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
+__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_5_4_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+    unsigned i;
+    unsigned int a, b, c, d, e;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    for ( i=0; i<source_width; i+=5 )
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];
+        d = src[3];
+        e = src[4];
+
+        des[0] = a;
+        des[1] = ((b*192 + c* 64 + 128)>>8);
+        des[2] = ((c*128 + d*128 + 128)>>8);
+        des[3] = ((d* 64 + e*192 + 128)>>8);
+
+        src += 5;
+        des += 4;
+    }
+    */
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         esi,        source              ;
+        mov         edi,        dest                ;
+
+        mov         ecx,        source_width         ;
+        movq        mm5,        const54_1           ;
+
+        pxor        mm7,        mm7                 ;
+        movq        mm6,        const54_2           ;
+
+        movq        mm4,        round_values         ;
+        lea         edx,        [esi+ecx]           ;
+        horizontal_line_5_4_loop:
+
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psrlq       mm0,        8                   ;
+        01 02 03 04 05 06 07 xx
+        punpcklbw   mm1,        mm7                 ;
+        xx 00 xx 01 xx 02 xx 03
+
+        punpcklbw   mm0,        mm7                 ;
+        xx 01 xx 02 xx 03 xx 04
+        pmullw      mm1,        mm5
+
+        pmullw      mm0,        mm6
+        add         esi,        5
+
+        add         edi,        4
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        cmp         esi,        edx
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-4], mm1
+
+        jl          horizontal_line_5_4_loop
+
+    }
+
+}
+__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };   //  64/256
+__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };   // 128/256
+__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };   // 192/256
+// Vertical 5:4 down-scale: five source rows a..e produce four destination rows, 4 columns per pass.
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // out0 = a, out1 = (192b + 64c + r) >> 8, out2 = (128c + 128d + r) >> 8, out3 = (64d + 192e + r) >> 8, r = round_values
+    __asm
+    {
+        push        ebx                             // ebx is used as the column counter below
+
+        mov         esi,    source                    // esi = source row 0 (a)
+        mov         ecx,    src_pitch               // ecx = source pitch
+
+        mov         edi,    dest                    // edi = destination row 0
+        pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking and packing)
+
+        mov         edx,    dest_pitch               // edx = destination pitch
+        mov         ebx,    dest_width              // ebx = loop counter, decremented by 4 per pass
+
+        vs_5_4_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // src[0] (a), 4 columns
+        movd        mm1,    DWORD ptr [esi+ecx]     // src[1] (b)
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2] (c)
+        lea         eax,    [esi+ecx*2]             // eax = source row 2, base for rows 3 and 4
+
+        punpcklbw   mm1,    mm7                     // b to words
+        punpcklbw   mm2,    mm7                     // c to words
+
+        movq        mm3,    mm2                     // second copy of c
+        pmullw      mm1,    three_fourths           // b * 3/4
+
+        pmullw      mm2,    one_fourths             // c * 1/4
+        movd        mm4,    [eax+ecx]               // src[3] (d)
+
+        pmullw      mm3,    two_fourths             // c * 2/4
+        punpcklbw   mm4,    mm7                     // d to words
+
+        movq        mm5,    mm4                     // second copy of d
+        pmullw      mm4,    two_fourths             // d * 2/4
+
+        paddw       mm1,    mm2                     // b * 3/4 + c * 1/4
+        movd        mm6,    [eax+ecx*2]             // src[4] (e)
+
+        pmullw      mm5,    one_fourths             // d * 1/4
+        paddw       mm1,    round_values;           // + rounding term
+
+        paddw       mm3,    mm4                     // c * 2/4 + d * 2/4
+        psrlw       mm1,    8                       // des[1] as words
+
+        punpcklbw   mm6,    mm7                     // e to words
+        paddw       mm3,    round_values            // + rounding term
+
+        pmullw      mm6,    three_fourths           // e * 3/4
+        psrlw       mm3,    8                       // des[2] as words
+
+        packuswb    mm1,    mm7                     // des[1] as bytes
+        packuswb    mm3,    mm7                     // des[2] as bytes
+
+        movd        DWORD PTR [edi], mm0            // write des[0] = a, unchanged
+        movd        DWORD PTR [edi+edx], mm1        // write des[1]
+
+
+        paddw       mm5,    mm6                     // d * 1/4 + e * 3/4
+        movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
+
+        lea         eax,    [edi+edx*2]             // eax = destination row 2, base for row 3
+        paddw       mm5,    round_values            // + rounding term
+
+        psrlw       mm5,    8                       // des[3] as words
+        add         edi,    4                       // next 4 destination columns (eax already taken)
+
+        packuswb    mm5,    mm7                     // des[3] as bytes
+        movd        DWORD PTR [eax+edx], mm5        // write des[3]
+
+        add         esi,    4                       // next 4 source columns
+        sub         ebx,    4                       // 4 columns done
+
+        jg         vs_5_4_loop                      // repeat while columns remain
+
+        pop         ebx                             // restore the caller's ebx
+    }
+}
+
+
+__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
+__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
+
+
+static
+void horizontal_line_5_3_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+
+    (void) dest_width;
+    __asm
+    {
+
+        mov         esi,        source              ;
+        mov         edi,        dest                ;
+
+        mov         ecx,        source_width         ;
+        movq        mm5,        const53_1           ;
+
+        pxor        mm7,        mm7                 ;
+        movq        mm6,        const53_2           ;
+
+        movq        mm4,        round_values         ;
+        lea         edx,        [esi+ecx-5]         ;
+        horizontal_line_5_3_loop:
+
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   ;
+        xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   ;
+        01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   ;
+        00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  ;
+        xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        add         esi,        5
+
+        add         edi,        3
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        cmp         esi,        edx
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-3], mm1
+        jl          horizontal_line_5_3_loop
+
+//exit condition
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   ;
+        xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   ;
+        01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   ;
+        00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  ;
+        xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        packuswb    mm1,        mm7
+        movd        eax,        mm1
+
+        mov         edx,        eax
+        shr         edx,        16
+
+        mov         WORD PTR[edi],   ax
+        mov         BYTE PTR[edi+2], dl
+
+    }
+
+}
+
+__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };   //  85/256
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };   // 171/256
+// Vertical 5:3 down-scale: five source rows a..e produce three destination rows, 4 columns per pass.
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // out0 = a, out1 = (85b + 171c + r) >> 8, out2 = (171d + 85e + r) >> 8, r = round_values
+    __asm
+    {
+        push        ebx                             // ebx is used as the column counter below
+
+        mov         esi,    source                    // esi = source row 0 (a)
+        mov         ecx,    src_pitch               // ecx = source pitch
+
+        mov         edi,    dest                    // edi = destination row 0
+        pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking and packing)
+
+        mov         edx,    dest_pitch               // edx = destination pitch
+        movq        mm5,    one_thirds              // mm5 = 1/3
+
+        movq        mm6,    two_thirds              // mm6 = 2/3
+        mov         ebx,    dest_width;             // ebx = loop counter, decremented by 4 per pass
+
+        vs_5_3_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // src[0] (a), 4 columns
+        movd        mm1,    DWORD ptr [esi+ecx]     // src[1] (b)
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2] (c)
+        lea         eax,    [esi+ecx*2]             // eax = source row 2, base for rows 3 and 4
+
+        punpcklbw   mm1,    mm7                     // b to words
+        punpcklbw   mm2,    mm7                     // c to words
+
+        pmullw      mm1,    mm5                     // b * 1/3
+        pmullw      mm2,    mm6                     // c * 2/3
+
+        movd        mm3,    DWORD ptr [eax+ecx]     // src[3] (d)
+        movd        mm4,    DWORD ptr [eax+ecx*2]   // src[4] (e)
+
+        punpcklbw   mm3,    mm7                     // d to words
+        punpcklbw   mm4,    mm7                     // e to words
+
+        pmullw      mm3,    mm6                     // d * 2/3
+        pmullw      mm4,    mm5                     // e * 1/3
+
+
+        movd        DWORD PTR [edi], mm0            // write des[0] = a, unchanged
+        paddw       mm1,    mm2                     // b * 1/3 + c * 2/3
+
+        paddw       mm1,    round_values            // + rounding term
+        psrlw       mm1,    8                       // des[1] as words
+
+        packuswb    mm1,    mm7                     // des[1] as bytes
+        paddw       mm3,    mm4                     // d * 2/3 + e * 1/3
+
+        paddw       mm3,    round_values            // + rounding term
+        movd        DWORD PTR [edi+edx], mm1        // write des[1]
+
+        psrlw       mm3,    8                       // des[2] as words
+        packuswb    mm3,    mm7                     // des[2] as bytes
+
+        movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
+
+
+        add         edi,    4                       // next 4 destination columns
+        add         esi,    4                       // next 4 source columns
+
+        sub         ebx,    4                       // 4 columns done
+        jg          vs_5_3_loop                     // repeat while columns remain
+
+        pop         ebx                             // restore the caller's ebx
+    }
+}
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_2_1_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source pixels.
+ *                  unsigned int source_width    : NOT USED.
+ *                  unsigned char *dest         : Pointer to destination pixels.
+ *                  unsigned int dest_width      : Number of output pixels.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels:
+ *                  every second source pixel is kept, without filtering.
+ *  SPECIAL NOTES : Four output pixels are written per pass and the first
+ *                  pass runs before dest_width is tested.
+ ****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;      // NOTE(review): redundant; dest_width is read by the asm below
+    (void) source_width;
+    __asm
+    {
+        mov         esi,    source                  // esi = source line
+        mov         edi,    dest                    // edi = destination line
+
+        pxor        mm7,    mm7                     // zero for packing
+        mov         ecx,    dest_width              // ecx = number of output pixels
+
+        xor         edx,    edx                     // edx = output index
+        hs_2_1_loop:
+
+        movq        mm0,    [esi+edx*2]             // 8 source pixels for 4 outputs
+        psllw       mm0,    8                       // discard the odd pixel of each pair ...
+
+        psrlw       mm0,    8                       // ... leaving the even pixels as words
+        packuswb    mm0,    mm7                     // four kept pixels in the low dword
+
+        movd        DWORD Ptr [edi+edx], mm0;       // write 4 output pixels
+        add         edx,    4                       // 4 outputs done
+
+        cmp         edx,    ecx
+        jl          hs_2_1_loop                     // repeat while edx < dest_width
+
+    }
+}
+
+
+
+static
+void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    /* Unfiltered variant: dest_width bytes of the given source row are copied to dest as they are. */
+    (void) src_pitch, (void) dest_pitch;    /* neither pitch is needed for a single-row copy */
+    vpx_memcpy(dest, source, dest_width);
+}
+
+
+__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };   //  48/256 = 3/16
+__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };   // 160/256 = 10/16
+// Filtered vertical 2:1 down-scale: dest = 3/16 * row above + 10/16 * source row + 3/16 * row below.
+static
+void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    // Reads the rows at source - src_pitch, source and source + src_pitch; 4 columns per pass.
+    (void) dest_pitch;
+    __asm
+    {
+        mov         esi,        source
+        mov         edi,        dest
+
+        mov         eax,        src_pitch
+        mov         edx,        dest_width
+
+        pxor        mm7,        mm7             //zero for unpacking and packing
+        sub         esi,        eax             //back one line
+
+
+        lea         ecx,        [esi+edx];      //end of the row above = loop limit for esi
+        movq        mm6,        round_values;   //rounding term, kept in a register for the loop
+
+        movq        mm5,        three_sixteenths;
+        movq        mm4,        ten_sixteenths;
+
+        vs_2_1_i_loop:
+        movd        mm0,        [esi]           // row above
+        movd        mm1,        [esi+eax]       // source row
+
+        movd        mm2,        [esi+eax*2]     // row below
+        punpcklbw   mm0,        mm7
+
+        pmullw      mm0,        mm5             // above * 3/16
+        punpcklbw   mm1,        mm7
+
+        pmullw      mm1,        mm4             // source * 10/16
+        punpcklbw   mm2,        mm7
+
+        pmullw      mm2,        mm5             // below * 3/16
+        paddw       mm0,        mm6             // + rounding term (register copy of round_values)
+
+        paddw       mm1,        mm2
+        paddw       mm0,        mm1             // weighted sum of the three rows
+
+        psrlw       mm0,        8               // / 256
+        packuswb    mm0,        mm7
+
+        movd        DWORD PTR [edi],        mm0 // write 4 output pixels
+        add         esi,        4
+
+        add         edi,        4;
+        cmp         esi,        ecx
+        jb          vs_2_1_i_loop               // addresses are unsigned: loop while esi < ecx
+
+    }
+}
+
+
+
+void
+register_mmxscalers(void)
+{
+    /* Point every scaler hook at the routine to use on an MMX-capable
+     * machine. The hooks are independent, so they are grouped by ratio. */
+
+    /* 4:5 up-scale */
+    vp8_horizontal_line_4_5_scale    = horizontal_line_4_5_scale_mmx;
+    vp8_vertical_band_4_5_scale      = vertical_band_4_5_scale_mmx;
+    vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
+    /* 3:5 up-scale */
+    vp8_horizontal_line_3_5_scale    = horizontal_line_3_5_scale_mmx;
+    vp8_vertical_band_3_5_scale      = vertical_band_3_5_scale_mmx;
+    vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
+    /* 1:2 up-scale */
+    vp8_horizontal_line_1_2_scale    = horizontal_line_1_2_scale_mmx;
+    vp8_vertical_band_1_2_scale      = vertical_band_1_2_scale_mmx;
+    vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
+    /* 3:4 and 2:3 up-scale are routed to the vp8cx_*_c routines */
+    vp8_horizontal_line_3_4_scale    = vp8cx_horizontal_line_3_4_scale_c;
+    vp8_vertical_band_3_4_scale      = vp8cx_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_horizontal_line_2_3_scale    = vp8cx_horizontal_line_2_3_scale_c;
+    vp8_vertical_band_2_3_scale      = vp8cx_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+    /* 5:4, 5:3 and 2:1 down-scale */
+    vp8_horizontal_line_5_4_scale    = horizontal_line_5_4_scale_mmx;
+    vp8_vertical_band_5_4_scale      = vertical_band_5_4_scale_mmx;
+    vp8_horizontal_line_5_3_scale    = horizontal_line_5_3_scale_mmx;
+    vp8_vertical_band_5_3_scale      = vertical_band_5_3_scale_mmx;
+    vp8_horizontal_line_2_1_scale    = horizontal_line_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale      = vertical_band_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale_i    = vertical_band_2_1_scale_i_mmx;
+}
diff --git a/vpx_scale/win32/scalesystemdependant.c b/vpx_scale/win32/scalesystemdependant.c
new file mode 100644
index 000000000..9ed48bfc6
--- /dev/null
+++ b/vpx_scale/win32/scalesystemdependant.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     system_dependant.c
+*
+*   Description  :     Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+#include "cpuidlib.h"
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+extern void register_generic_scalers(void);
+extern void register_mmxscalers(void);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Checks for machine specific features such as MMX support
+ *                  and sets the scaler function pointers accordingly.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void
+vp8_scale_machine_specific_config(void)
+{
+    // Processor capability flags, filled in by vpx_get_processor_flags().
+    int mmx_enabled;
+    int xmm_enabled;
+    int wmt_enabled;
+
+    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+    if (!mmx_enabled && !xmm_enabled && !wmt_enabled)
+    {
+        // None of the flags is set: use the original 'C' versions of
+        // every scaler.
+
+        // Up-scaling, horizontal lines
+        vp8_horizontal_line_1_2_scale    = vp8cx_horizontal_line_1_2_scale_c;
+        vp8_horizontal_line_2_3_scale    = vp8cx_horizontal_line_2_3_scale_c;
+        vp8_horizontal_line_3_4_scale    = vp8cx_horizontal_line_3_4_scale_c;
+        vp8_horizontal_line_3_5_scale    = vp8cx_horizontal_line_3_5_scale_c;
+        vp8_horizontal_line_4_5_scale    = vp8cx_horizontal_line_4_5_scale_c;
+
+        // Up-scaling, vertical bands
+        vp8_vertical_band_1_2_scale      = vp8cx_vertical_band_1_2_scale_c;
+        vp8_vertical_band_2_3_scale      = vp8cx_vertical_band_2_3_scale_c;
+        vp8_vertical_band_3_4_scale      = vp8cx_vertical_band_3_4_scale_c;
+        vp8_vertical_band_3_5_scale      = vp8cx_vertical_band_3_5_scale_c;
+        vp8_vertical_band_4_5_scale      = vp8cx_vertical_band_4_5_scale_c;
+
+        // Up-scaling, last vertical band of an image
+        vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c;
+        vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+        vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+        vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c;
+        vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c;
+
+        // Down-scaling
+        vp8_horizontal_line_2_1_scale    = vp8cx_horizontal_line_2_1_scale_c;
+        vp8_horizontal_line_5_3_scale    = vp8cx_horizontal_line_5_3_scale_c;
+        vp8_horizontal_line_5_4_scale    = vp8cx_horizontal_line_5_4_scale_c;
+        vp8_vertical_band_2_1_scale      = vp8cx_vertical_band_2_1_scale_c;
+        vp8_vertical_band_2_1_scale_i    = vp8cx_vertical_band_2_1_scale_i_c;
+        vp8_vertical_band_5_3_scale      = vp8cx_vertical_band_5_3_scale_c;
+        vp8_vertical_band_5_4_scale      = vp8cx_vertical_band_5_4_scale_c;
+
+        return;
+    }
+
+    // At least one flag is set: install the MMX scaler set.
+    register_mmxscalers();
+}
diff --git a/vpx_scale/x86_64/scaleopt.c b/vpx_scale/x86_64/scaleopt.c
new file mode 100644
index 000000000..3d2d5f237
--- /dev/null
+++ b/vpx_scale/x86_64/scaleopt.c
@@ -0,0 +1,1749 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     scaleopt.cpp
+*
+*   Description  :     Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+*  Module Statics
+****************************************************************************/
+// Packed 16-bit constants loaded into MMX registers by the scalers below.
+// Interpolation weights are expressed as fractions of 256: each output
+// pixel is (w0 * p0 + w1 * p1 + 128) >> 8 with w0 + w1 == 256, so
+// 51/102/154/205 stand in for 1/5, 2/5, 3/5 and 4/5, and round_values
+// supplies the +128 rounding term added before the shift by 8.
+__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
+// Per-lane weights for horizontal 4:5 scaling: const45_1 multiplies the
+// left-hand pixel of each pair, const45_2 the right-hand pixel.
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
+// Selects byte 6 of a quadword; used by the exit path of
+// horizontal_line_4_5_scale_mmx to duplicate the last source pixel.
+__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
+// Per-lane weights for horizontal 3:5 scaling: const35_1 multiplies the
+// left-hand pixel of each pair, const35_2 the right-hand pixel.
+__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*
+*  ROUTINE       : horizontal_line_3_5_scale_mmx
+*
+*  INPUTS        : const unsigned char *source : Pointer to the source line.
+*                  unsigned int source_width    : Number of source bytes in the
+*                                                 line; sets the loop bound.
+*                  unsigned char *dest         : Pointer to the destination line.
+*                  unsigned int dest_width      : Not used.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
+*                  Each group of three source pixels a, b, c (with d being
+*                  the first pixel of the next group) yields five outputs:
+*                      a
+*                      (102 * a + 154 * b + 128) >> 8
+*                      (205 * b +  51 * c + 128) >> 8
+*                      ( 51 * b + 205 * c + 128) >> 8
+*                      (154 * c + 102 * d + 128) >> 8
+*                  For the final group c is used in place of d.
+*
+*  SPECIAL NOTES : The loop body runs once before its bound is tested and
+*                  the exit code then always processes one further group,
+*                  so at least two 3-pixel groups are processed per call.
+*                  Every group is fetched with a 4-byte load, so the final
+*                  group reads one byte beyond its three pixels (the value
+*                  is discarded).
+*
+****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+
+        // rbx is used as scratch below; saved here and restored at the end
+        push        rbx
+
+        mov         rsi,    source
+        mov         rdi,    dest
+
+        // rdx = address of the last 3-pixel group (loop end marker)
+        mov         ecx,    source_width
+        lea         rdx,    [rsi+rcx-3];
+
+        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
+        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        // Main loop: one 3-pixel group in, five pixels out per iteration.
+        // Byte diagrams list bytes from lowest address to highest.
+        horiz_line_3_5_loop:
+
+        mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
+        mov         ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             // right-hand pixels * const35_2
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             // left-hand pixels * const35_1
+
+        // 4-byte store; only byte 0 (pixel 00) is final, bytes 1..3 are
+        // overwritten by the interpolated store below
+        mov         [rdi],  ebx             // write output 00 xx xx xx
+        add         rsi,    3
+
+        add         rdi,    5
+        paddw       mm0,    mm1
+
+        paddw       mm0,    mm4
+        psrlw       mm0,    8
+
+        // flags from this cmp are consumed by the jl below; the MMX
+        // instructions in between do not modify flags
+        cmp         rsi,    rdx
+        packuswb    mm0,    mm7
+
+        // rdi was already advanced by 5, so rdi-4 is output byte 1 of the group
+        movd        DWORD Ptr [rdi-4], mm0
+        jl          horiz_line_3_5_loop
+
+//Exit:
+        // Last group: same computation, but pixel 02 is duplicated in place
+        // of the (non-existent) first pixel of a following group.
+        mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
+        mov         ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         eax,    8               // eax = xx 01 02 02
+        and         eax,    0xffff0000      // eax = xx xx 02 02
+
+        or          eax,    ebx             // eax = 01 02 02 02
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             // right-hand pixels * const35_2
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             // left-hand pixels * const35_1
+
+        mov         [rdi],  ebx             // write output 00 xx xx xx
+        paddw       mm0,    mm1
+
+        paddw       mm0,    mm4
+        psrlw       mm0,    8
+
+        packuswb    mm0,    mm7
+        movd        DWORD Ptr [rdi+1], mm0
+
+        pop rbx
+
+    }
+
+}
+
+
+/****************************************************************************
+*
+*  ROUTINE       : horizontal_line_4_5_scale_mmx
+*
+*  INPUTS        : const unsigned char *source : Pointer to the source line.
+*                  unsigned int source_width    : Number of source bytes in the
+*                                                 line; sets the loop bound.
+*                  unsigned char *dest         : Pointer to the destination line.
+*                  unsigned int dest_width      : Not used.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
+*                  Each group of four source pixels a, b, c, d (with e being
+*                  the first pixel of the next group) yields five outputs:
+*                      a
+*                      ( 51 * a + 205 * b + 128) >> 8
+*                      (102 * b + 154 * c + 128) >> 8
+*                      (154 * c + 102 * d + 128) >> 8
+*                      (205 * d +  51 * e + 128) >> 8
+*                  Two groups (8 source pixels, 10 output pixels) are
+*                  handled per pass. For the final pass the last source
+*                  pixel is used in place of e of the second group.
+*
+*  SPECIAL NOTES : The loop body runs once before its bound is tested and
+*                  the exit code then always processes one further 8-pixel
+*                  block, so at least 16 source pixels are processed per
+*                  call.
+*
+****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void)dest_width;
+
+    __asm
+    {
+
+        mov         rsi,    source
+        mov         rdi,    dest
+
+        // rdx = address of the last 8-pixel block (loop end marker)
+        mov         ecx,    source_width
+        lea         rdx,    [rsi+rcx-8];
+
+        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
+        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        // Main loop: 8 source pixels in, 10 output pixels out per iteration.
+        // Byte diagrams list bytes from lowest address to highest.
+        horiz_line_4_5_loop:
+
+        movq        mm0,    QWORD PTR [rsi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    QWORD PTR [rsi+1];        // mm1 = 01 02 03 04 05 06 07 08
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
+
+        // 4-byte store; only byte 0 is final, bytes 1..3 are overwritten below
+        movd        DWORD PTR [rdi],  mm0             // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        // low byte of mm2 is source pixel 04, which is output pixel 5 as is;
+        // the other three stored bytes are overwritten below
+        movd        DWORD PTR [rdi+5], mm2            // write output 05 xx xx xx
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
+
+        paddw       mm0,    mm1             // sum of the two weighted pixels
+        paddw       mm0,    mm4             // added round values
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7
+
+        movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
+        add         rdi,    10
+
+        add         rsi,    8
+        paddw       mm2,    mm3             // sum of the two weighted pixels
+
+        paddw       mm2,    mm4             // added round values
+        // flags from this cmp are consumed by the jl below; the MMX
+        // instructions in between do not modify flags
+        cmp         rsi,    rdx
+
+        psrlw       mm2,    8
+        packuswb    mm2,    mm7
+
+        // rdi was already advanced by 10, so rdi-4 is output byte 6 of the block
+        movd        DWORD PTR [rdi-4], mm2 // write output 06 07 08 09
+        jl         horiz_line_4_5_loop
+
+//Exit:
+        // Last 8-pixel block: nothing is read past pixel 07; instead the
+        // shifted copy gets pixel 07 duplicated into its top byte.
+        movq        mm0,    [rsi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
+
+        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
+        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
+
+        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
+        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
+
+        movq        mm3,    mm1
+
+        movd        DWORD PTR [rdi],  mm0   // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [rdi+5], mm2  // write output 05 xx xx xx
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
+
+        paddw       mm0,    mm1             // sum of the two weighted pixels
+        paddw       mm0,    mm4             // added round values
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
+
+        movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
+        paddw       mm2,    mm3             // sum of the two weighted pixels
+
+        paddw       mm2,    mm4             // added round values
+        psrlw       mm2,    8
+
+        packuswb    mm2,    mm7
+        movd        DWORD PTR [rdi+6], mm2  // write output 06 07 08 09
+
+
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : vertical_band_4_5_scale_mmx
+*
+*  INPUTS        : unsigned char *dest    : Pointer to the first row of the band;
+*                                           rows are read and rewritten in place.
+*                  unsigned int dest_pitch : Byte offset from one row to the next.
+*                  unsigned int dest_width : Number of columns to process; the
+*                                           loop handles 8 columns per pass.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+*                  With a, b, c, d the source rows at dest + 0..3 * pitch
+*                  and an the row at dest + 5 * pitch, the rows written are:
+*                      row 1 = ( 51 * a  + 205 * b + 128) >> 8
+*                      row 2 = (102 * b  + 154 * c + 128) >> 8
+*                      row 3 = (154 * c  + 102 * d + 128) >> 8
+*                      row 4 = (205 * d  +  51 * an + 128) >> 8
+*                  Row 0 is left unchanged.
+*
+*  SPECIAL NOTES : The routine uses the first line of the band below
+*                  the current band. The function also has a "C" only
+*                  version. Columns are processed in blocks of 8 and the
+*                  loop body runs at least once, so a dest_width that is
+*                  not a multiple of 8 is effectively rounded up.
+*
+****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         rsi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         rdi,    [rsi+rcx*2]             // two lines below
+        add         rdi,    rcx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        // Each iteration processes one 8-column wide strip of the band.
+        vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [rsi]         // src[0];
+        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // constant
+
+        // the products are formed in copies (mm4/mm5) so that the unpacked
+        // src[1] stays available in mm1/mm3 for the next output row
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
+        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
+        movq        mm1,    [rdi]                   // mm1=Src[3];
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [rdi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3]
+        //  mm7 -- cleared for unpacking
+
+        // rdi is row 3, so rdi + 2 * pitch is row 5
+        movq        mm0,    [rdi+rcx*2]             // mm0, Src[0] of the next group
+
+        movq        mm5,    four_fifths              // mm5 = 4/5
+        pmullw      mm1,    mm5                     // d * 4/5
+
+        movq        mm6,    one_fifth                // mm6 = 1/5
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm3,    mm5                     // d * 4/5
+        punpcklbw   mm0,    mm7                     // unpack low
+
+        pmullw      mm0,    mm6                     // an * 1/5
+        punpckhbw   mm2,    mm7                     // unpack high
+
+        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
+        pmullw      mm2,    mm6                     // an * 1/5
+
+        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[4]
+
+        movq        QWORD ptr [rdi+rcx], mm1        // write des[4]
+
+        // advance both row pointers to the next 8-column strip
+        add         rdi,    8
+        add         rsi,    8
+
+        // loop while columns remain
+        sub         rdx,    8
+        jg          vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : last_vertical_band_4_5_scale_mmx
+*
+*  INPUTS        : unsigned char *dest    : Pointer to the first row of the band;
+*                                           rows are read and rewritten in place.
+*                  unsigned int dest_pitch : Byte offset from one row to the next.
+*                  unsigned int dest_width : Number of columns to process; the
+*                                           loop handles 8 columns per pass.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : None
+*
+*  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+*                  With a, b, c, d the source rows at dest + 0..3 * pitch,
+*                  the rows written are:
+*                      row 1 = ( 51 * a + 205 * b + 128) >> 8
+*                      row 2 = (102 * b + 154 * c + 128) >> 8
+*                      row 3 = (154 * c + 102 * d + 128) >> 8
+*                      row 4 = d (copied before row 3 is overwritten)
+*                  Row 0 is left unchanged.
+*
+*  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, no row below the
+*                  band is read; the fifth output row is a copy of the
+*                  fourth source row. The function also has a "C" only
+*                  version. Columns are processed in blocks of 8 and the
+*                  loop body runs at least once, so a dest_width that is
+*                  not a multiple of 8 is effectively rounded up.
+*
+****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         rsi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         rdi,    [rsi+rcx*2]             // two lines below
+        add         rdi,    rcx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        // Each iteration processes one 8-column wide strip of the band.
+        last_vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [rsi]         // src[0];
+        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // constant
+
+        // the products are formed in copies (mm4/mm5) so that the unpacked
+        // src[1] stays available in mm1/mm3 for the next output row
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
+        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
+        movq        mm1,    [rdi]                   // mm1=Src[3];
+
+        // last band: the unmodified src[3] is copied to the row below it
+        movq        QWORD ptr [rdi+rcx], mm1        // write des[4];
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [rdi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3]
+        //  mm7 -- cleared for unpacking
+        // advance both row pointers to the next 8-column strip
+        add         rdi,    8
+        add         rsi,    8
+
+        // loop while columns remain
+        sub         rdx,    8
+        jg          last_vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : vertical_band_3_5_scale_mmx
+*
+*  INPUTS        : unsigned char *dest    :
+*                  unsigned int dest_pitch :
+*                  unsigned int dest_width :
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+*
+*  SPECIAL NOTES : The routine uses the first line of the band below
+*                  the current band. The function also has an "C" only
+*                  version.
+*
+****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         rsi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         rdi,    [rsi+rcx*2]             // two lines below
+        add         rdi,    rcx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [rsi]         // src[0];
+        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
+        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [rdi], mm4            // write des[3]
+
+        //  mm0, mm2 --- Src[3]
+
+        pxor        mm7,    mm7                     // clear mm7 for unpacking
+        movq        mm1,    [rdi+rcx*2]             // mm1 = Src[0] of the next group
+
+        movq        mm5,    three_fifths             // mm5 = 3/5
+        pmullw      mm0,    mm5                     // d * 3/5
+
+        movq        mm6,    two_fifths                // mm6 = 2/5
+        movq        mm3,    mm1                     // make a copy
+
+        pmullw      mm2,    mm5                     // d * 3/5
+        punpcklbw   mm1,    mm7                     // unpack low
+
+        pmullw      mm1,    mm6                     // an * 2/5
+        punpckhbw   mm3,    mm7                     // unpack high
+
+        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
+        pmullw      mm3,    mm6                     // an * 2/5
+
+        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des[4]
+
+        movq        QWORD ptr [rdi+rcx], mm0        // write des[4]
+
+        add         rdi,    8
+        add         rsi,    8
+
+        sub         rdx,    8
+        jg          vs_3_5_loop
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : last_vertical_band_3_5_scale_mmx
+*
+*  INPUTS        : unsigned char *dest    : Top row of the band. The three
+*                                           source rows live in this buffer
+*                                           and the output rows are written
+*                                           back into it.
+*                  unsigned int dest_pitch : Byte offset between consecutive rows.
+*                  unsigned int dest_width : Number of columns to process; the
+*                                           loop handles 8 columns per pass.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 3 to 5 up-scaling of the last 3-pixel high band of pixels.
+*
+*  SPECIAL NOTES : Only the rows at dest, dest + dest_pitch and
+*                  dest + 2 * dest_pitch are read; there is no band below
+*                  to borrow a row from, so output row 4 is a plain copy of
+*                  source row 2. Row 0 is never written. mm7 serves both
+*                  as the zero register for unpacking and as scratch, so it
+*                  is cleared again at the end of every pass. The function
+*                  also has an "C" only version.
+*
+****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         rsi,    dest                    // rsi = top row (source and destination share the buffer)
+        mov         ecx,    dest_pitch               // rcx = pitch (32-bit move zero-extends into rcx)
+
+        lea         rdi,    [rsi+rcx*2]             // two lines below
+        add         rdi,    rcx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+
+        last_vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [rsi]         // src[0];
+        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [rsi+rcx], mm0        // write des[1] (src[1] is already held unpacked in mm1/mm3)
+        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
+
+
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        QWORD ptr [rdi+rcx], mm0        // write des[4] = unfiltered copy of src[2]
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high (last use of mm7 as zero in this pass)
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high (mm7 is scratch from here on)
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [rdi], mm4            // write des[3]
+
+        // mm7 now holds c * 4/5 (high half), not zero. The top of the loop
+        // unpacks bytes against mm7, so it has to be cleared before the
+        // next pass; without this every column group after the first is
+        // unpacked against stale data.
+        pxor        mm7,    mm7                     // clear mm7 for unpacking
+
+        add         rdi,    8
+        add         rsi,    8
+
+        sub         rdx,    8
+        jg          last_vs_3_5_loop
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : vertical_band_1_2_scale_mmx
+*
+*  INPUTS        : unsigned char *dest    : Upper of the two rows being averaged.
+*                  unsigned int dest_pitch : Byte offset between consecutive rows.
+*                  unsigned int dest_width : Number of columns to process; the
+*                                           loop handles 8 columns per pass.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
+*
+*  SPECIAL NOTES : The routine uses the first line of the band below
+*                  the current band: the row at dest + dest_pitch is
+*                  overwritten with the rounded average of the rows at
+*                  dest and dest + 2 * dest_pitch. The function also has
+*                  an "C" only version.
+*
+****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         rsi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size (32-bit move zero-extends into rcx)
+
+        pxor        mm7,    mm7                     // clear out mm7 (zero source for byte -> word unpacking)
+        mov         edx,    dest_width               // Loop counter
+
+        vs_1_2_loop:
+
+        movq        mm0,    [rsi]                   // get Src[0]
+        movq        mm1,    [rsi + rcx * 2]         // get Src[1] (two rows down; the row in between is the output)
+
+        movq        mm2,    mm0                     // make copy before unpack
+        movq        mm3,    mm1                     // make copy before unpack
+
+        punpcklbw   mm0,    mm7                     // low Src[0]
+        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
+
+        punpcklbw   mm1,    mm7                     // low Src[1]
+        paddw       mm0,    mm1                     // low (a + b)
+
+        punpckhbw   mm2,    mm7                     // high Src[0]
+        paddw       mm0,    mm6                     // low (a + b + 1)
+
+        punpckhbw   mm3,    mm7                     // high Src[1]
+        paddw       mm2,    mm3                     // high (a + b )
+
+        psraw       mm0,    1                       // low (a + b +1 )/2
+        paddw       mm2,    mm6                     // high (a + b + 1)
+
+        psraw       mm2,    1                       // high (a + b + 1)/2
+        packuswb    mm0,    mm2                     // pack results
+
+        movq        [rsi+rcx], mm0                  // write out eight bytes
+        add         rsi,    8                       // next eight columns
+
+        sub         rdx,    8                       // flags from this sub drive the jg below
+        jg          vs_1_2_loop
+    }
+
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : last_vertical_band_1_2_scale_mmx
+*
+*  INPUTS        : unsigned char *dest    : Row to be duplicated.
+*                  unsigned int dest_pitch : Byte offset between consecutive rows.
+*                  unsigned int dest_width : Number of columns to process; the
+*                                           loop handles 8 columns per pass.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 1 to 2 up-scaling of the last band of pixels.
+*
+*  SPECIAL NOTES : No row below the band is read. The row at
+*                  dest + dest_pitch is filled with an exact copy of the
+*                  row at dest. The function also has an "C" only
+*                  version.
+*
+****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         rsi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size (32-bit move zero-extends into rcx)
+
+        mov         edx,    dest_width               // Loop counter
+
+        last_vs_1_2_loop:
+
+        movq        mm0,    [rsi]                   // get Src[0]
+        movq        [rsi+rcx], mm0                  // write out eight bytes
+
+        add         rsi,    8                       // next eight columns
+        sub         rdx,    8                       // flags from this sub drive the jg below
+
+        jg          last_vs_1_2_loop
+    }
+}
+
+/****************************************************************************
+*
+*  ROUTINE       : horizontal_line_1_2_scale_mmx
+*
+*  INPUTS        : const unsigned char *source : First pixel of the source line.
+*                  unsigned int source_width    : Number of source pixels.
+*                  unsigned char *dest         : First pixel of the output line.
+*                  unsigned int dest_width      : Not used.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
+*
+*  SPECIAL NOTES : Each source pixel p[i] produces two output pixels:
+*                  p[i] itself and the rounded average of p[i] and p[i+1].
+*                  The main loop body runs once before its first test and
+*                  the final group of eight is always processed after it,
+*                  so at least 16 source pixels are consumed.
+*
+****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+        mov         rsi,    source                  // rsi = read pointer
+        mov         rdi,    dest                    // rdi = write pointer
+
+        pxor        mm7,    mm7                     // mm7 = 0, zero source for byte -> word unpacking
+        movq        mm6,    four_ones               // rounding term added before the halving shift
+
+        mov         ecx,    source_width            // rcx = source pixels remaining
+
+        hs_1_2_loop:
+
+        movq        mm0,    [rsi]                   // p[0..7]
+        movq        mm1,    [rsi+1]                 // p[1..8], the right-hand neighbours
+
+        movq        mm2,    mm0                     // copy of p for the high half
+        movq        mm3,    mm1                     // copy of neighbours for the high half
+
+        movq        mm4,    mm0                     // untouched p bytes, kept for the interleave
+        punpcklbw   mm0,    mm7                     // low p as words
+
+        punpcklbw   mm1,    mm7                     // low neighbours as words
+        paddw       mm0,    mm1                     // low (p[i] + p[i+1])
+
+        paddw       mm0,    mm6                     // low, plus rounding
+        punpckhbw   mm2,    mm7                     // high p as words
+
+        punpckhbw   mm3,    mm7                     // high neighbours as words
+        paddw       mm2,    mm3                     // high (p[i] + p[i+1])
+
+        paddw       mm2,    mm6                     // high, plus rounding
+        psraw       mm0,    1                       // low averages
+
+        psraw       mm2,    1                       // high averages
+        packuswb    mm0,    mm2                     // mm0 = eight averaged bytes
+
+        movq        mm2,    mm4                     // mm2 = original p bytes
+        punpcklbw   mm2,    mm0                     // interleave: p0 avg0 p1 avg1 p2 avg2 p3 avg3
+
+        movq        [rdi],  mm2                     // first eight output pixels
+        punpckhbw   mm4,    mm0                     // interleave: p4 avg4 ... p7 avg7
+
+        movq        [rdi+8], mm4                    // second eight output pixels
+        add         rsi,    8                       // eight source pixels consumed
+
+        add         rdi,    16                      // sixteen output pixels produced
+        sub         rcx,    8
+
+        cmp         rcx,    8                       // loop while more than eight pixels remain
+        jg          hs_1_2_loop
+
+// last eight pixels: the neighbour vector is built from the register instead
+// of loading [rsi+1], with p[7] repeated as its own right-hand neighbour
+
+        movq        mm0,    [rsi]                   // p[0..7]
+        movq        mm1,    mm0
+
+        movq        mm2,    mm0                     // copy of p for the high half
+        movq        mm3,    mm1
+
+        psrlq       mm1,    8                       // p1 .. p7, 0
+        psrlq       mm3,    56                      // p7 moved to the lowest byte
+
+        psllq       mm3,    56                      // p7 alone in the highest byte
+        por         mm1,    mm3                     // p1 .. p7, p7
+
+        movq        mm3,    mm1                     // copy of neighbours for the high half
+        movq        mm4,    mm0                     // untouched p bytes, kept for the interleave
+
+        punpcklbw   mm0,    mm7                     // low p as words
+        punpcklbw   mm1,    mm7                     // low neighbours as words
+
+        paddw       mm0,    mm1                     // low sums
+        paddw       mm0,    mm6                     // plus rounding
+
+        punpckhbw   mm2,    mm7                     // high p as words
+        punpckhbw   mm3,    mm7                     // high neighbours as words
+
+        paddw       mm2,    mm3                     // high sums
+        paddw       mm2,    mm6                     // plus rounding
+
+        psraw       mm0,    1                       // low averages
+        psraw       mm2,    1                       // high averages
+
+        packuswb    mm0,    mm2                     // mm0 = eight averaged bytes
+        movq        mm2,    mm4                     // mm2 = original p bytes
+
+        punpcklbw   mm2,    mm0                     // p0 avg0 ... p3 avg3
+        movq        [rdi],  mm2
+
+        punpckhbw   mm4,    mm0                     // p4 avg4 ... p7 avg7
+        movq        [rdi+8], mm4
+    }
+}
+
+
+
+
+
+/* Blend weights (out of 256) for the four outputs of each 5-pixel group:
+ * output i = (src[i] * const54_1[i] + src[i + 1] * const54_2[i] + round) >> 8. */
+__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
+__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
+
+
+/****************************************************************************
+*
+*  ROUTINE       : horizontal_line_5_4_scale_mmx
+*
+*  INPUTS        : const unsigned char *source : Pointer to source data.
+*                  unsigned int source_width    : Number of source pixels in the line.
+*                  unsigned char *dest         : Pointer to destination data.
+*                  unsigned int dest_width      : Stride of destination (NOT USED).
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : Copies horizontal line of pixels from source to
+*                  destination scaling down by 5 to 4.
+*
+*  SPECIAL NOTES : Every pass loads 8 source bytes but advances by 5, so
+*                  the last pass reads up to 3 bytes beyond the final
+*                  group. The loop body runs at least once.
+*
+****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+    Reference implementation in C:
+
+    unsigned i;
+    unsigned int a, b, c, d, e;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    for ( i=0; i<source_width; i+=5 )
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];
+        d = src[3];
+        e = src[4];
+
+        des[0] = a;
+        des[1] = ((b*192 + c* 64 + 128)>>8);
+        des[2] = ((c*128 + d*128 + 128)>>8);
+        des[3] = ((d* 64 + e*192 + 128)>>8);
+
+        src += 5;
+        des += 4;
+    }
+    */
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         rsi,        source              // rsi = read pointer
+        mov         rdi,        dest                // rdi = write pointer
+
+        mov         ecx,        source_width        // rcx = source pixels in the line
+        movq        mm5,        const54_1           // weights applied to src[i]
+
+        pxor        mm7,        mm7                 // mm7 = 0 for byte -> word unpacking
+        movq        mm6,        const54_2           // weights applied to src[i+1]
+
+        movq        mm4,        round_values        // rounding term added before >> 8
+        lea         rdx,        [rsi+rcx]           // rdx = end of the source line
+        horizontal_line_5_4_loop:
+
+        // Byte-layout notes list register contents lowest byte first;
+        // "xx" marks a zero / don't-care byte.
+        movq        mm0,        QWORD PTR  [rsi]    // 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 // 00 01 02 03 04 05 06 07
+
+        psrlq       mm0,        8                   // 01 02 03 04 05 06 07 xx
+        punpcklbw   mm1,        mm7                 // xx 00 xx 01 xx 02 xx 03
+
+        punpcklbw   mm0,        mm7                 // xx 01 xx 02 xx 03 xx 04
+        pmullw      mm1,        mm5                 // src[i]   * const54_1[i]
+
+        pmullw      mm0,        mm6                 // src[i+1] * const54_2[i]
+        add         rsi,        5                   // five source pixels consumed
+
+        add         rdi,        4                   // four output pixels produced
+        paddw       mm1,        mm0                 // weighted sum
+
+        paddw       mm1,        mm4                 // + rounding
+        psrlw       mm1,        8                   // back to 8-bit range
+
+        cmp         rsi,        rdx                 // flags survive the MMX ops below
+        packuswb    mm1,        mm7                 // four result bytes in the low dword
+
+        movd        DWORD PTR [rdi-4], mm1          // rdi was already advanced
+
+        jl          horizontal_line_5_4_loop
+
+    }
+
+}
+__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };   // 64/256 per word
+__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };   // 128/256 per word
+__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };   // 192/256 per word
+/* vertical_band_5_4_scale_mmx
+ *
+ * 5 to 4 vertical down-scaling of one band. Five source rows a..e (at
+ * source + k * src_pitch, k = 0..4) are read and four rows are written at
+ * dest + k * dest_pitch, k = 0..3:
+ *     des[0] = a
+ *     des[1] = b * 3/4 + c * 1/4
+ *     des[2] = c * 2/4 + d * 2/4
+ *     des[3] = d * 1/4 + e * 3/4
+ * round_values is added before each >> 8. Four columns are handled per
+ * pass and the loop body runs at least once.
+ *
+ * NOTE(review): the block uses rbx, which is callee-saved in both x64
+ * calling conventions; confirm the compiler preserves it around __asm.
+ */
+static
+void vertical_band_5_4_scale_mmx
+(
+    unsigned char *source,
+    unsigned int src_pitch,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+
+    __asm
+    {
+
+        mov         rsi,    source                    // rsi = source pointer (row a)
+        mov         ecx,    src_pitch               // rcx = source pitch (32-bit move zero-extends)
+
+        mov         rdi,    dest                    // rdi = destination pointer (des[0])
+        pxor        mm7,    mm7                     // clear out mm7
+
+        mov         edx,    dest_pitch               // rdx = destination pitch
+        mov         ebx,    dest_width              // rbx = columns remaining (loop counter)
+
+        vs_5_4_loop:
+
+        movd        mm0,    DWORD ptr [rsi]         // src[0] (row a, passed through unfiltered)
+        movd        mm1,    DWORD ptr [rsi+rcx]     // src[1] (row b)
+
+        movd        mm2,    DWORD ptr [rsi+rcx*2]   // src[2] (row c)
+        lea         rax,    [rsi+rcx*2]             // rax = row c, base for rows d and e
+
+        punpcklbw   mm1,    mm7                     // b as words
+        punpcklbw   mm2,    mm7                     // c as words
+
+        movq        mm3,    mm2                     // second copy of c
+        pmullw      mm1,    three_fourths           // b * 3/4
+
+        pmullw      mm2,    one_fourths             // c * 1/4
+        movd        mm4,    [rax+rcx]               // src[3] (row d)
+
+        pmullw      mm3,    two_fourths             // c * 2/4
+        punpcklbw   mm4,    mm7                     // d as words
+
+        movq        mm5,    mm4                     // second copy of d
+        pmullw      mm4,    two_fourths             // d * 2/4
+
+        paddw       mm1,    mm2                     // b * 3/4 + c * 1/4
+        movd        mm6,    [rax+rcx*2]             // src[4] (row e)
+
+        pmullw      mm5,    one_fourths             // d * 1/4
+        paddw       mm1,    round_values;           // + rounding
+
+        paddw       mm3,    mm4                     // c * 2/4 + d * 2/4
+        psrlw       mm1,    8
+
+        punpcklbw   mm6,    mm7                     // e as words
+        paddw       mm3,    round_values            // + rounding
+
+        pmullw      mm6,    three_fourths           // e * 3/4
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm7                     // des[1]
+        packuswb    mm3,    mm7                     // des[2]
+
+        movd        DWORD PTR [rdi], mm0            // write des[0] = a
+        movd        DWORD PTR [rdi+rdx], mm1        // write des[1]
+
+
+        paddw       mm5,    mm6                     // d * 1/4 + e * 3/4
+        movd        DWORD PTR [rdi+rdx*2], mm3      // write des[2]
+
+        lea         rax,    [rdi+rdx*2]             // rax = des[2] row, taken before rdi advances
+        paddw       mm5,    round_values            // + rounding
+
+        psrlw       mm5,    8
+        add         rdi,    4                       // next four destination columns
+
+        packuswb    mm5,    mm7                     // des[3]
+        movd        DWORD PTR [rax+rdx], mm5        // write des[3]
+
+        add         rsi,    4                       // next four source columns
+        sub         rbx,    4                       // flags from this sub drive the jg below
+
+        jg         vs_5_4_loop
+    }
+}
+
+
+/* Blend weights (out of 256) for the three outputs of each 5-pixel group.
+ * const53_2 scales the even source bytes (0, 2, 4), const53_1 the odd
+ * source bytes (1, 3) after they have been shifted up by one word:
+ *     des[0] =  src[0] * 256
+ *     des[1] =  src[1] *  85 + src[2] * 171
+ *     des[2] =  src[3] * 171 + src[4] *  85
+ * each followed by rounding and >> 8. The fourth word is always weighted 0. */
+__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
+__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
+
+
+/****************************************************************************
+*
+*  ROUTINE       : horizontal_line_5_3_scale_mmx
+*
+*  INPUTS        : const unsigned char *source : Pointer to source data.
+*                  unsigned int source_width    : Number of source pixels in the line.
+*                  unsigned char *dest         : Pointer to destination data.
+*                  unsigned int dest_width      : NOT USED.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : Copies horizontal line of pixels from source to
+*                  destination scaling down by 5 to 3.
+*
+*  SPECIAL NOTES : The main loop stores 4 bytes per 3-pixel group; the
+*                  extra byte is overwritten by the following group. The
+*                  final group is handled separately and stores exactly
+*                  3 bytes. Every group loads 8 source bytes. The loop
+*                  body runs once before its first test and the final
+*                  group always follows, so at least two groups are
+*                  processed.
+*
+****************************************************************************/
+static
+void horizontal_line_5_3_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         rsi,        source              // rsi = read pointer
+        mov         rdi,        dest                // rdi = write pointer
+
+        mov         ecx,        source_width        // rcx = source pixels in the line
+        movq        mm5,        const53_1           // weights for the odd source bytes
+
+        pxor        mm7,        mm7                 // mm7 = 0, used when packing
+        movq        mm6,        const53_2           // weights for the even source bytes
+
+        movq        mm4,        round_values        // rounding term added before >> 8
+        lea         rdx,        [rsi+rcx-5]         // rdx = start of the final 5-pixel group
+        horizontal_line_5_3_loop:
+
+        // Byte-layout notes are carried over from the original source;
+        // "xx" marks a zero / don't-care byte.
+        movq        mm0,        QWORD PTR  [rsi]    // 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 // 00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   // xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   // 01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   // 00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  // xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6                 // even bytes * const53_2
+
+        pmullw      mm1,        mm5                 // odd bytes  * const53_1
+        add         rsi,        5                   // five source pixels consumed
+
+        add         rdi,        3                   // three output pixels produced
+        paddw       mm1,        mm0                 // weighted sum
+
+        paddw       mm1,        mm4                 // + rounding
+        psrlw       mm1,        8                   // back to 8-bit range
+
+        cmp         rsi,        rdx                 // flags survive the MMX ops below
+        packuswb    mm1,        mm7                 // result bytes in the low dword
+
+        movd        DWORD PTR [rdi-3], mm1          // 4-byte store; 4th byte is rewritten by the next group
+        jl          horizontal_line_5_3_loop
+
+//exit condition: final group, same arithmetic, but only three bytes are stored
+        movq        mm0,        QWORD PTR  [rsi]    // 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 // 00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   // xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   // 01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   // 00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  // xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6                 // even bytes * const53_2
+
+        pmullw      mm1,        mm5                 // odd bytes  * const53_1
+        paddw       mm1,        mm0                 // weighted sum
+
+        paddw       mm1,        mm4                 // + rounding
+        psrlw       mm1,        8                   // back to 8-bit range
+
+        packuswb    mm1,        mm7                 // result bytes in the low dword
+        movd        rax,        mm1                 // move the packed result to a GPR
+
+        mov         rdx,        rax
+        shr         rdx,        16                  // dl = third result byte
+
+        mov         WORD PTR[rdi],   ax             // first two result bytes
+        mov         BYTE PTR[rdi+2], dl             // third result byte
+
+    }
+
+}
+
+__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };   // 85/256 per word
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };   // 171/256 per word
+/* vertical_band_5_3_scale_mmx
+ *
+ * 5 to 3 vertical down-scaling of one band. Five source rows a..e (at
+ * source + k * src_pitch, k = 0..4) are read and three rows are written at
+ * dest + k * dest_pitch, k = 0..2:
+ *     des[0] = a
+ *     des[1] = b * 1/3 + c * 2/3
+ *     des[2] = d * 2/3 + e * 1/3
+ * round_values is added before each >> 8. Four columns are handled per
+ * pass and the loop body runs at least once.
+ *
+ * NOTE(review): the block uses rbx, which is callee-saved in both x64
+ * calling conventions; confirm the compiler preserves it around __asm.
+ */
+static
+void vertical_band_5_3_scale_mmx
+(
+    unsigned char *source,
+    unsigned int src_pitch,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+
+    __asm
+    {
+
+        mov         rsi,    source                    // rsi = source pointer (row a)
+        mov         ecx,    src_pitch               // rcx = source pitch (32-bit move zero-extends)
+
+        mov         rdi,    dest                    // rdi = destination pointer (des[0])
+        pxor        mm7,    mm7                     // clear out mm7
+
+        mov         edx,    dest_pitch               // rdx = destination pitch
+        movq        mm5,    one_thirds              // mm5 = 1/3 weights
+
+        movq        mm6,    two_thirds              // mm6 = 2/3 weights
+        mov         ebx,    dest_width;             // rbx = columns remaining (loop counter)
+
+        vs_5_3_loop:
+
+        movd        mm0,    DWORD ptr [rsi]         // src[0] (row a, passed through unfiltered)
+        movd        mm1,    DWORD ptr [rsi+rcx]     // src[1] (row b)
+
+        movd        mm2,    DWORD ptr [rsi+rcx*2]   // src[2] (row c)
+        lea         rax,    [rsi+rcx*2]             // rax = row c, base for rows d and e
+
+        punpcklbw   mm1,    mm7                     // b as words
+        punpcklbw   mm2,    mm7                     // c as words
+
+        pmullw      mm1,    mm5                     // b * 1/3
+        pmullw      mm2,    mm6                     // c * 2/3
+
+        movd        mm3,    DWORD ptr [rax+rcx]     // src[3] (row d)
+        movd        mm4,    DWORD ptr [rax+rcx*2]   // src[4] (row e)
+
+        punpcklbw   mm3,    mm7                     // d as words
+        punpcklbw   mm4,    mm7                     // e as words
+
+        pmullw      mm3,    mm6                     // d * 2/3
+        pmullw      mm4,    mm5                     // e * 1/3
+
+
+        movd        DWORD PTR [rdi], mm0            // write des[0] = a
+        paddw       mm1,    mm2                     // b * 1/3 + c * 2/3
+
+        paddw       mm1,    round_values            // + rounding
+        psrlw       mm1,    8
+
+        packuswb    mm1,    mm7                     // des[1]
+        paddw       mm3,    mm4                     // d * 2/3 + e * 1/3
+
+        paddw       mm3,    round_values            // + rounding
+        movd        DWORD PTR [rdi+rdx], mm1        // write des[1]
+
+        psrlw       mm3,    8
+        packuswb    mm3,    mm7                     // des[2]
+
+        movd        DWORD PTR [rdi+rdx*2], mm3      // write des[2]
+
+
+        add         rdi,    4                       // next four destination columns
+        add         rsi,    4                       // next four source columns
+
+        sub         rbx,    4                       // flags from this sub drive the jg below
+        jg          vs_5_3_loop
+    }
+}
+
+
+
+
+/****************************************************************************
+*
+*  ROUTINE       : horizontal_line_2_1_scale_mmx
+*
+*  INPUTS        : const unsigned char *source : First pixel of the source line.
+*                  unsigned int source_width    : Not used.
+*                  unsigned char *dest         : First pixel of the output line.
+*                  unsigned int dest_width      : Number of output pixels; drives
+*                                                the loop bound.
+*
+*  OUTPUTS       : None.
+*
+*  RETURNS       : void
+*
+*  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels.
+*
+*  SPECIAL NOTES : No filtering is applied: every even-indexed source
+*                  pixel is kept and every odd-indexed one is dropped.
+*                  Each pass loads 8 source bytes and stores 4 output
+*                  bytes; the loop body runs at least once.
+*
+****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;   // NOTE(review): redundant -- dest_width is read by the asm below
+
+    __asm
+    {
+        mov         rsi,    source                  // rsi = source base
+        mov         rdi,    dest                    // rdi = destination base
+
+        pxor        mm7,    mm7                     // mm7 = 0, fills the unused half when packing
+        mov         ecx,    dest_width              // rcx = output pixel count (32-bit move zero-extends)
+
+        xor         rdx,    rdx                     // rdx = output index
+        hs_2_1_loop:
+
+        movq        mm0,    [rsi+rdx*2]             // eight source pixels for four outputs
+        psllw       mm0,    8                       // push the even byte of each word to the top, dropping the odd one
+
+        psrlw       mm0,    8                       // bring it back down: each word = even source byte
+        packuswb    mm0,    mm7                     // four kept bytes in the low dword
+
+        movd        DWORD Ptr [rdi+rdx], mm0;       // store four output pixels
+        add         rdx,    4
+
+        cmp         rdx,    rcx                     // continue while output index < dest_width
+        jl          hs_2_1_loop
+
+    }
+}
+
+
+
+/* 2 to 1 vertical down-scaling without filtering: the output row is a
+ * straight copy of the first dest_width bytes of the source row. The two
+ * pitch arguments are not used by the body. */
+static
+void vertical_band_2_1_scale_mmx
+(
+    unsigned char *source,
+    unsigned int src_pitch,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    (void) src_pitch;
+    (void) dest_pitch;
+
+    vpx_memcpy(dest, source, dest_width);
+}
+
+
+__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };   // 48/256 per word
+__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };   // 160/256 per word
+/* vertical_band_2_1_scale_i_mmx
+ *
+ * 2 to 1 vertical down-scaling with a 3-tap filter. For each column the
+ * output is
+ *     (above * 48 + current * 160 + below * 48 + round_values) >> 8
+ * where "current" is the row at source, "above" the row one src_pitch
+ * before it and "below" the row one src_pitch after it. Four columns are
+ * handled per pass until dest_width columns are done; the loop body runs
+ * at least once. dest_pitch is not used.
+ *
+ * NOTE(review): the row at source - src_pitch is read, so the caller must
+ * guarantee it is addressable -- confirm against the call site.
+ */
+static
+void vertical_band_2_1_scale_i_mmx
+(
+    unsigned char *source,
+    unsigned int src_pitch,
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         rsi,        source          // rsi = current source row
+        mov         rdi,        dest            // rdi = output row
+
+        mov         eax,        src_pitch       // rax = source pitch (32-bit move zero-extends)
+        mov         edx,        dest_width      // rdx = columns to produce
+
+        pxor        mm7,        mm7             // mm7 = 0 for byte -> word unpacking
+        sub         rsi,        rax             //back one line
+
+
+        lea         rcx,        [rsi+rdx];      // rcx = end pointer within the row above
+        movq        mm6,        round_values;   // NOTE(review): mm6 is never read; the loop adds round_values from memory
+
+        movq        mm5,        three_sixteenths;   // outer tap weight
+        movq        mm4,        ten_sixteenths;     // centre tap weight
+
+        vs_2_1_i_loop:
+        movd        mm0,        [rsi]           // row above
+        movd        mm1,        [rsi+rax]       // current row
+
+        movd        mm2,        [rsi+rax*2]     // row below
+        punpcklbw   mm0,        mm7             // above as words
+
+        pmullw      mm0,        mm5             // above * 3/16
+        punpcklbw   mm1,        mm7             // current as words
+
+        pmullw      mm1,        mm4             // current * 10/16
+        punpcklbw   mm2,        mm7             // below as words
+
+        pmullw      mm2,        mm5             // below * 3/16
+        paddw       mm0,        round_values    // + rounding
+
+        paddw       mm1,        mm2             // current + below terms
+        paddw       mm0,        mm1             // full weighted sum
+
+        psrlw       mm0,        8               // back to 8-bit range
+        packuswb    mm0,        mm7             // four result bytes in the low dword
+
+        movd        DWORD PTR [rdi],        mm0 // store four output pixels
+        add         rsi,        4               // next four source columns
+
+        add         rdi,        4;              // next four output columns
+        cmp         rsi,        rcx             // continue until the end pointer is reached
+        jl          vs_2_1_i_loop
+
+    }
+}
+
+
+
+/* Points every scaler hook assigned below at its MMX implementation from
+ * this file. The assignments are independent of one another. */
+void
+register_mmxscalers(void)
+{
+    /* 1:2 up-scaling */
+    vp8_horizontal_line_1_2_scale     = horizontal_line_1_2_scale_mmx;
+    vp8_vertical_band_1_2_scale       = vertical_band_1_2_scale_mmx;
+    vp8_last_vertical_band_1_2_scale  = last_vertical_band_1_2_scale_mmx;
+
+    /* 3:5 up-scaling */
+    vp8_horizontal_line_3_5_scale     = horizontal_line_3_5_scale_mmx;
+    vp8_vertical_band_3_5_scale       = vertical_band_3_5_scale_mmx;
+    vp8_last_vertical_band_3_5_scale  = last_vertical_band_3_5_scale_mmx;
+
+    /* 4:5 up-scaling */
+    vp8_horizontal_line_4_5_scale     = horizontal_line_4_5_scale_mmx;
+    vp8_vertical_band_4_5_scale       = vertical_band_4_5_scale_mmx;
+    vp8_last_vertical_band_4_5_scale  = last_vertical_band_4_5_scale_mmx;
+
+    /* 5:4 down-scaling */
+    vp8_horizontal_line_5_4_scale     = horizontal_line_5_4_scale_mmx;
+    vp8_vertical_band_5_4_scale       = vertical_band_5_4_scale_mmx;
+
+    /* 5:3 down-scaling */
+    vp8_horizontal_line_5_3_scale     = horizontal_line_5_3_scale_mmx;
+    vp8_vertical_band_5_3_scale       = vertical_band_5_3_scale_mmx;
+
+    /* 2:1 down-scaling (plain and filtered vertical variants) */
+    vp8_horizontal_line_2_1_scale     = horizontal_line_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale       = vertical_band_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale_i     = vertical_band_2_1_scale_i_mmx;
+}
diff --git a/vpx_scale/x86_64/scalesystemdependant.c b/vpx_scale/x86_64/scalesystemdependant.c
new file mode 100644
index 000000000..43f05a68c
--- /dev/null
+++ b/vpx_scale/x86_64/scalesystemdependant.c
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     system_dependant.c
+*
+*   Description  :     Miscellaneous system dependant functions
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "vpx_scale/vpxscale.h"
+#include "cpuidlib.h"
+
+/****************************************************************************
+*  Imports
+*****************************************************************************/
+extern void register_generic_scalers(void);
+extern void register_mmxscalers(void);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_scale_machine_specific_config
+ *
+ *  INPUTS        : None.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Selects which set of scalers gets registered. The
+ *                  selection flag is hard-wired on, so
+ *                  register_mmxscalers() is always the one called;
+ *                  register_generic_scalers() is kept as the alternative
+ *                  path.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void
+vp8_scale_machine_specific_config(void)
+{
+    const int use_mmx_scalers = 1;
+
+    if (!use_mmx_scalers)
+    {
+        register_generic_scalers();
+        return;
+    }
+
+    register_mmxscalers();
+}
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
new file mode 100644
index 000000000..a8d0ce45b
--- /dev/null
+++ b/vpx_scale/yv12config.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef YV12_CONFIG_H
+#define YV12_CONFIG_H
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Border sizes in pixels, per the macro names.
+ * NOTE(review): presumably the padding kept around each plane of a VP7 / VP8
+ * frame buffer -- confirm against the callers of the allocator. */
+#define VP7BORDERINPIXELS       48
+#define VP8BORDERINPIXELS       32
+
+    /*************************************
+     For INT_YUV:
+
+     Y = (R+G*2+B)/4;
+     U = (R-B)/2;
+     V =  (G*2 - R - B)/4;
+    And
+     R = Y+U-V;
+     G = Y+V;
+     B = Y-U-V;
+    ************************************/
+    /* Colour-format tag; stored in YV12_BUFFER_CONFIG.clrtype. */
+    typedef enum
+    {
+        REG_YUV = 0,    // Regular yuv
+        INT_YUV = 1     // The type of yuv that can be transferred to and from RGB through the integer transform
+              }
+              YUV_TYPE;
+
+    /* Describes one YV12 frame buffer: per-plane dimensions and strides,
+     * per-plane data pointers, and allocation bookkeeping.
+     * NOTE(review): the field meanings below are inferred from the names;
+     * units (pixels vs. bytes) should be confirmed against the allocator. */
+    typedef struct
+    {
+        /* Y plane geometry. */
+        int   y_width;
+        int   y_height;
+        int   y_stride;
+//    int   yinternal_width;
+
+        /* Geometry shared by the U and V planes. */
+        int   uv_width;
+        int   uv_height;
+        int   uv_stride;
+//    int   uvinternal_width;
+
+        /* Per-plane data pointers. */
+        unsigned char *y_buffer;
+        unsigned char *u_buffer;
+        unsigned char *v_buffer;
+
+        /* NOTE(review): presumably the base of the single allocation that
+         * backs all three planes -- confirm in vp8_yv12_alloc_frame_buffer. */
+        unsigned char *buffer_alloc;
+        int border;         /* presumably the border passed to the allocator -- confirm */
+        int frame_size;     /* presumably the total allocation size -- confirm */
+        YUV_TYPE clrtype;   /* REG_YUV or INT_YUV */
+    } YV12_BUFFER_CONFIG;
+
+    /* Frame-buffer helpers operating on a YV12_BUFFER_CONFIG; each returns an
+     * int. NOTE(review): the implementations are not in this header --
+     * presumably the return value is a status code; confirm the
+     * success/failure convention before relying on it. */
+    int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border);
+    int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+    int vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif //YV12_CONFIG_H
diff --git a/vpx_scale/yv12extend.h b/vpx_scale/yv12extend.h
new file mode 100644
index 000000000..9968feae8
--- /dev/null
+++ b/vpx_scale/yv12extend.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef YV12_EXTEND_H
+#define YV12_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+    /* NOTE(review): by its name, fills the border area around the planes of
+     * ybf; the implementation is not visible here -- confirm. */
+    void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf);
+
+    /* Copy Y,U,V buffer data from src to dst, filling border of dst as well. */
+
+    void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+    /* NOTE(review): by its name, the Y-plane-only variant of the copy above;
+     * confirm whether it also fills the destination border. */
+    void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif