summaryrefslogtreecommitdiff
path: root/vp8/common
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2010-05-18 11:58:33 -0400
committerJohn Koleszar <jkoleszar@google.com>2010-05-18 11:58:33 -0400
commit0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/common
downloadlibvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip
Initial WebM release
Diffstat (limited to 'vp8/common')
-rw-r--r--vp8/common/alloccommon.c251
-rw-r--r--vp8/common/alloccommon.h22
-rw-r--r--vp8/common/arm/armv6/bilinearfilter_v6.asm237
-rw-r--r--vp8/common/arm/armv6/copymem16x16_v6.asm181
-rw-r--r--vp8/common/arm/armv6/copymem8x4_v6.asm127
-rw-r--r--vp8/common/arm/armv6/copymem8x8_v6.asm127
-rw-r--r--vp8/common/arm/armv6/filter_v6.asm383
-rw-r--r--vp8/common/arm/armv6/idct_v6.asm376
-rw-r--r--vp8/common/arm/armv6/iwalsh_v6.asm151
-rw-r--r--vp8/common/arm/armv6/loopfilter_v6.asm1263
-rw-r--r--vp8/common/arm/armv6/recon_v6.asm280
-rw-r--r--vp8/common/arm/armv6/simpleloopfilter_v6.asm321
-rw-r--r--vp8/common/arm/armv6/sixtappredict8x4_v6.asm277
-rw-r--r--vp8/common/arm/bilinearfilter_arm.c211
-rw-r--r--vp8/common/arm/filter_arm.c234
-rw-r--r--vp8/common/arm/idct_arm.h60
-rw-r--r--vp8/common/arm/loopfilter_arm.c246
-rw-r--r--vp8/common/arm/loopfilter_arm.h84
-rw-r--r--vp8/common/arm/neon/bilinearpredict16x16_neon.asm361
-rw-r--r--vp8/common/arm/neon/bilinearpredict4x4_neon.asm134
-rw-r--r--vp8/common/arm/neon/bilinearpredict8x4_neon.asm139
-rw-r--r--vp8/common/arm/neon/bilinearpredict8x8_neon.asm187
-rw-r--r--vp8/common/arm/neon/buildintrapredictorsmby_neon.asm583
-rw-r--r--vp8/common/arm/neon/copymem16x16_neon.asm58
-rw-r--r--vp8/common/arm/neon/copymem8x4_neon.asm33
-rw-r--r--vp8/common/arm/neon/copymem8x8_neon.asm42
-rw-r--r--vp8/common/arm/neon/iwalsh_neon.asm95
-rw-r--r--vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm205
-rw-r--r--vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm188
-rw-r--r--vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm117
-rw-r--r--vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm158
-rw-r--r--vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm231
-rw-r--r--vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm235
-rw-r--r--vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm257
-rw-r--r--vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm236
-rw-r--r--vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm296
-rw-r--r--vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm303
-rw-r--r--vp8/common/arm/neon/recon16x16mb_neon.asm130
-rw-r--r--vp8/common/arm/neon/recon2b_neon.asm53
-rw-r--r--vp8/common/arm/neon/recon4b_neon.asm68
-rw-r--r--vp8/common/arm/neon/reconb_neon.asm60
-rw-r--r--vp8/common/arm/neon/save_neon_reg.asm35
-rw-r--r--vp8/common/arm/neon/shortidct4x4llm_1_neon.asm66
-rw-r--r--vp8/common/arm/neon/shortidct4x4llm_neon.asm126
-rw-r--r--vp8/common/arm/neon/sixtappredict16x16_neon.asm494
-rw-r--r--vp8/common/arm/neon/sixtappredict4x4_neon.asm425
-rw-r--r--vp8/common/arm/neon/sixtappredict8x4_neon.asm476
-rw-r--r--vp8/common/arm/neon/sixtappredict8x8_neon.asm527
-rw-r--r--vp8/common/arm/recon_arm.c108
-rw-r--r--vp8/common/arm/recon_arm.h70
-rw-r--r--vp8/common/arm/reconintra4x4_arm.c408
-rw-r--r--vp8/common/arm/reconintra_arm.c61
-rw-r--r--vp8/common/arm/subpixel_arm.h84
-rw-r--r--vp8/common/arm/systemdependent.c148
-rw-r--r--vp8/common/arm/vpx_asm_offsets.c91
-rw-r--r--vp8/common/bigend.h31
-rw-r--r--vp8/common/blockd.c23
-rw-r--r--vp8/common/blockd.h299
-rw-r--r--vp8/common/boolcoder.h569
-rw-r--r--vp8/common/codec_common_interface.h92
-rw-r--r--vp8/common/coefupdateprobs.h184
-rw-r--r--vp8/common/common.h41
-rw-r--r--vp8/common/common_types.h17
-rw-r--r--vp8/common/context.c398
-rw-r--r--vp8/common/debugmodes.c156
-rw-r--r--vp8/common/defaultcoefcounts.h220
-rw-r--r--vp8/common/dma_desc.h124
-rw-r--r--vp8/common/duck_io.h115
-rw-r--r--vp8/common/entropy.c161
-rw-r--r--vp8/common/entropy.h101
-rw-r--r--vp8/common/entropymode.c270
-rw-r--r--vp8/common/entropymode.h71
-rw-r--r--vp8/common/entropymv.c48
-rw-r--r--vp8/common/entropymv.h41
-rw-r--r--vp8/common/extend.c120
-rw-r--r--vp8/common/extend.h20
-rw-r--r--vp8/common/filter_c.c539
-rw-r--r--vp8/common/findnearmv.c207
-rw-r--r--vp8/common/findnearmv.h41
-rw-r--r--vp8/common/fourcc.hpp120
-rw-r--r--vp8/common/g_common.h20
-rw-r--r--vp8/common/generic/systemdependent.c79
-rw-r--r--vp8/common/header.h42
-rw-r--r--vp8/common/idct.h77
-rw-r--r--vp8/common/idctllm.c189
-rw-r--r--vp8/common/invtrans.c86
-rw-r--r--vp8/common/invtrans.h22
-rw-r--r--vp8/common/littlend.h32
-rw-r--r--vp8/common/loopfilter.c601
-rw-r--r--vp8/common/loopfilter.h120
-rw-r--r--vp8/common/loopfilter_filters.c368
-rw-r--r--vp8/common/mac_specs.h30
-rw-r--r--vp8/common/mbpitch.c128
-rw-r--r--vp8/common/modecont.c39
-rw-r--r--vp8/common/modecont.h16
-rw-r--r--vp8/common/modecontext.c145
-rw-r--r--vp8/common/mv.h20
-rw-r--r--vp8/common/onyx.h222
-rw-r--r--vp8/common/onyxc_int.h205
-rw-r--r--vp8/common/onyxd.h67
-rw-r--r--vp8/common/partialgfupdate.h18
-rw-r--r--vp8/common/postproc.c641
-rw-r--r--vp8/common/postproc.h90
-rw-r--r--vp8/common/ppc/copy_altivec.asm46
-rw-r--r--vp8/common/ppc/filter_altivec.asm1012
-rw-r--r--vp8/common/ppc/filter_bilinear_altivec.asm676
-rw-r--r--vp8/common/ppc/idctllm_altivec.asm188
-rw-r--r--vp8/common/ppc/loopfilter_altivec.c142
-rw-r--r--vp8/common/ppc/loopfilter_filters_altivec.asm1252
-rw-r--r--vp8/common/ppc/platform_altivec.asm58
-rw-r--r--vp8/common/ppc/recon_altivec.asm174
-rw-r--r--vp8/common/ppc/systemdependent.c170
-rw-r--r--vp8/common/ppflags.h25
-rw-r--r--vp8/common/pragmas.h18
-rw-r--r--vp8/common/predictdc.c43
-rw-r--r--vp8/common/predictdc.h17
-rw-r--r--vp8/common/preproc.h45
-rw-r--r--vp8/common/preprocif.h75
-rw-r--r--vp8/common/proposed.h70
-rw-r--r--vp8/common/quant_common.c131
-rw-r--r--vp8/common/quant_common.h20
-rw-r--r--vp8/common/recon.c137
-rw-r--r--vp8/common/recon.h81
-rw-r--r--vp8/common/reconinter.c680
-rw-r--r--vp8/common/reconinter.h22
-rw-r--r--vp8/common/reconintra.c555
-rw-r--r--vp8/common/reconintra.h28
-rw-r--r--vp8/common/reconintra4x4.c330
-rw-r--r--vp8/common/reconintra4x4.h16
-rw-r--r--vp8/common/segmentation_common.c63
-rw-r--r--vp8/common/segmentation_common.h15
-rw-r--r--vp8/common/setupintrarecon.c37
-rw-r--r--vp8/common/setupintrarecon.h12
-rw-r--r--vp8/common/subpixel.h85
-rw-r--r--vp8/common/swapyv12buffer.c33
-rw-r--r--vp8/common/swapyv12buffer.h18
-rw-r--r--vp8/common/systemdependent.h20
-rw-r--r--vp8/common/textblit.c52
-rw-r--r--vp8/common/threading.h89
-rw-r--r--vp8/common/treecoder.c136
-rw-r--r--vp8/common/treecoder.h87
-rw-r--r--vp8/common/type_aliases.h116
-rw-r--r--vp8/common/vfwsetting.hpp75
-rw-r--r--vp8/common/vpx_ref_build_prefix.h23
-rw-r--r--vp8/common/vpxblit.h111
-rw-r--r--vp8/common/vpxblit_c64.h47
-rw-r--r--vp8/common/vpxerrors.h12
-rw-r--r--vp8/common/x86/boolcoder.cxx493
-rw-r--r--vp8/common/x86/idct_x86.h63
-rw-r--r--vp8/common/x86/idctllm_mmx.asm265
-rw-r--r--vp8/common/x86/iwalsh_mmx.asm172
-rw-r--r--vp8/common/x86/iwalsh_sse2.asm116
-rw-r--r--vp8/common/x86/loopfilter_mmx.asm1776
-rw-r--r--vp8/common/x86/loopfilter_sse2.asm1978
-rw-r--r--vp8/common/x86/loopfilter_x86.c274
-rw-r--r--vp8/common/x86/loopfilter_x86.h99
-rw-r--r--vp8/common/x86/postproc_mmx.asm533
-rw-r--r--vp8/common/x86/postproc_mmx.c1507
-rw-r--r--vp8/common/x86/postproc_sse2.asm688
-rw-r--r--vp8/common/x86/postproc_x86.h63
-rw-r--r--vp8/common/x86/recon_mmx.asm320
-rw-r--r--vp8/common/x86/recon_sse2.asm228
-rw-r--r--vp8/common/x86/recon_x86.h61
-rw-r--r--vp8/common/x86/subpixel_mmx.asm817
-rw-r--r--vp8/common/x86/subpixel_sse2.asm1032
-rw-r--r--vp8/common/x86/subpixel_x86.h88
-rw-r--r--vp8/common/x86/vp8_asm_stubs.c342
-rw-r--r--vp8/common/x86/x86_systemdependent.c117
168 files changed, 37458 insertions, 0 deletions
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
new file mode 100644
index 000000000..ac110f761
--- /dev/null
+++ b/vp8/common/alloccommon.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+#include "vpxerrors.h"
+
+#ifdef HAVE_CONFIG_H
+#include "vpx_config.h"
+#endif
+
+extern void vp8_init_scan_order_mask(void); /* (void): empty parens declare no prototype in C89/C99 */
+
+void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
+{
+    int i;
+    vpx_memset(mi - cols - 1, 0, sizeof(MODE_INFO) * (cols + 1)); /* fix: was sizeof(MODE_INFO) * cols + 1 — precedence made it sizeof*cols bytes + 1, not (cols+1) border entries */
+
+    for (i = 0; i < rows; i++)
+    {
+        vpx_memset(&mi[i*cols-1], 0, sizeof(MODE_INFO)); /* NOTE(review): mode_info_stride is cols+1; confirm i*cols-1 reaches each row's left-border entry */
+    }
+}
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+ vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+ vp8_yv12_de_alloc_frame_buffer(&oci->new_frame);
+ vp8_yv12_de_alloc_frame_buffer(&oci->last_frame);
+ vp8_yv12_de_alloc_frame_buffer(&oci->golden_frame);
+ vp8_yv12_de_alloc_frame_buffer(&oci->alt_ref_frame);
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+
+ vpx_free(oci->above_context[Y1CONTEXT]);
+ vpx_free(oci->above_context[UCONTEXT]);
+ vpx_free(oci->above_context[VCONTEXT]);
+ vpx_free(oci->above_context[Y2CONTEXT]);
+ vpx_free(oci->mip);
+
+ oci->above_context[Y1CONTEXT] = 0;
+ oci->above_context[UCONTEXT] = 0;
+ oci->above_context[VCONTEXT] = 0;
+ oci->above_context[Y2CONTEXT] = 0;
+ oci->mip = 0;
+
+    // Structure used to monitor GF usage
+ if (oci->gf_active_flags != 0)
+ vpx_free(oci->gf_active_flags);
+
+ oci->gf_active_flags = 0;
+}
+
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+ vp8_de_alloc_frame_buffers(oci);
+
+ // our internal buffers are always multiples of 16
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->new_frame, width, height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->last_frame, width, height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->golden_frame, width, height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->alt_ref_frame, width, height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ oci->mb_rows = height >> 4;
+ oci->mb_cols = width >> 4;
+ oci->MBs = oci->mb_rows * oci->mb_cols;
+ oci->mode_info_stride = oci->mb_cols + 1;
+ oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+ if (!oci->mip)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+
+ oci->above_context[Y1CONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 4 , 1);
+
+ if (!oci->above_context[Y1CONTEXT])
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ oci->above_context[UCONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 2 , 1);
+
+ if (!oci->above_context[UCONTEXT])
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ oci->above_context[VCONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 2 , 1);
+
+ if (!oci->above_context[VCONTEXT])
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ oci->above_context[Y2CONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols , 1);
+
+ if (!oci->above_context[Y2CONTEXT])
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+
+    // Structures used to monitor GF usage
+ if (oci->gf_active_flags != 0)
+ vpx_free(oci->gf_active_flags);
+
+ oci->gf_active_flags = (unsigned char *)vpx_calloc(oci->mb_rows * oci->mb_cols, 1);
+
+ if (!oci->gf_active_flags)
+ {
+ vp8_de_alloc_frame_buffers(oci);
+ return ALLOC_FAILURE;
+ }
+
+ oci->gf_active_count = oci->mb_rows * oci->mb_cols;
+
+ return 0;
+}
+void vp8_setup_version(VP8_COMMON *cm)
+{
+ switch (cm->version)
+ {
+ case 0:
+ cm->no_lpf = 0;
+ cm->simpler_lpf = 0;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ case 1:
+ cm->no_lpf = 0;
+ cm->simpler_lpf = 1;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 2:
+ cm->no_lpf = 1;
+ cm->simpler_lpf = 0;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 3:
+ cm->no_lpf = 1;
+ cm->simpler_lpf = 1;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 1;
+ break;
+ default:
+ //4,5,6,7 are reserved for future use
+ cm->no_lpf = 0;
+ cm->simpler_lpf = 0;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ }
+}
+void vp8_create_common(VP8_COMMON *oci)
+{
+ vp8_machine_specific_config(oci);
+ vp8_default_coef_probs(oci);
+ vp8_init_mbmode_probs(oci);
+ vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+ oci->mb_no_coeff_skip = 1;
+ oci->no_lpf = 0;
+ oci->simpler_lpf = 0;
+ oci->use_bilinear_mc_filter = 0;
+ oci->full_pixel = 0;
+ oci->multi_token_partition = ONE_PARTITION;
+ oci->clr_type = REG_YUV;
+ oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+ // Initialise reference frame sign bias structure to defaults
+ vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+ // Default disable buffer to buffer copying
+ oci->copy_buffer_to_gf = 0;
+ oci->copy_buffer_to_arf = 0;
+}
+
+void vp8_remove_common(VP8_COMMON *oci)
+{
+ vp8_de_alloc_frame_buffers(oci);
+}
+
+void vp8_initialize_common(void) /* one-time global init: entropy trees, mode tables, scan-order mask */
+{
+    vp8_coef_tree_initialize();
+
+    vp8_entropy_mode_init();
+
+    vp8_init_scan_order_mask();
+
+}
diff --git a/vp8/common/alloccommon.h b/vp8/common/alloccommon.h
new file mode 100644
index 000000000..73c7383c7
--- /dev/null
+++ b/vp8/common/alloccommon.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp8_create_common(VP8_COMMON *oci);
+void vp8_remove_common(VP8_COMMON *oci);
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
+void vp8_setup_version(VP8_COMMON *oci);
+
+#endif
diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm
new file mode 100644
index 000000000..4428cf8ff
--- /dev/null
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -0,0 +1,237 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *output_ptr,
+; r2 unsigned int src_pixels_per_line,
+; r3 unsigned int output_height,
+; stack unsigned int output_width,
+; stack const short *vp8_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
+|vp8_filter_block2d_bil_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; output width
+
+ mov r12, r3 ; outer-loop counter
+ sub r2, r2, r4 ; src increment for height loop
+
+ ;;IF ARCHITECTURE=6
+ pld [r0]
+ ;;ENDIF
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; output_height*2
+ add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+ mov r11, r1 ; save output_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+    mov     lr, r4, lsr #2          ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ ;;IF ARCHITECTURE=6
+ pld [r0]
+ ;;ENDIF
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to immediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 int output_pitch,
+; r3 unsigned int output_height,
+; stack unsigned int output_width,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_bil_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; output width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_filter_block2d_bil_second_pass_armv6|
+
+ END
diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm
new file mode 100644
index 000000000..00e97397c
--- /dev/null
+++ b/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -0,0 +1,181 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_v6| PROC
+ stmdb sp!, {r4 - r7}
+ ;push {r4-r7}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #15
+ beq copy_mem16x16_fast
+
+ ands r4, r0, #7
+ beq copy_mem16x16_8
+
+ ands r4, r0, #3
+ beq copy_mem16x16_4
+
+ ;copy one byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+ ldrb r6, [r0, #2]
+ ldrb r7, [r0, #3]
+
+ mov r12, #16
+
+copy_mem16x16_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+ strb r6, [r2, #2]
+ strb r7, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+ ldrb r6, [r0, #6]
+ ldrb r7, [r0, #7]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+ strb r6, [r2, #6]
+ strb r7, [r2, #7]
+
+ ldrb r4, [r0, #8]
+ ldrb r5, [r0, #9]
+ ldrb r6, [r0, #10]
+ ldrb r7, [r0, #11]
+
+ strb r4, [r2, #8]
+ strb r5, [r2, #9]
+ strb r6, [r2, #10]
+ strb r7, [r2, #11]
+
+ ldrb r4, [r0, #12]
+ ldrb r5, [r0, #13]
+ ldrb r6, [r0, #14]
+ ldrb r7, [r0, #15]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #12]
+ strb r5, [r2, #13]
+ strb r6, [r2, #14]
+ strb r7, [r2, #15]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+ ldrneb r6, [r0, #2]
+ ldrneb r7, [r0, #3]
+
+ bne copy_mem16x16_1_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+ ldr r6, [r0, #8]
+ ldr r7, [r0, #12]
+
+ mov r12, #16
+
+copy_mem16x16_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+ str r6, [r2, #8]
+ str r7, [r2, #12]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+ ldrne r6, [r0, #8]
+ ldrne r7, [r0, #12]
+
+ bne copy_mem16x16_4_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+ sub r1, r1, #16
+ sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_8_loop
+ ldmia r0!, {r4-r5}
+ ;ldm r0, {r4-r5}
+ ldmia r0!, {r6-r7}
+
+ add r0, r0, r1
+
+ stmia r2!, {r4-r5}
+ subs r12, r12, #1
+ ;stm r2, {r4-r5}
+ stmia r2!, {r6-r7}
+
+ add r2, r2, r3
+
+ bne copy_mem16x16_8_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+ ;sub r1, r1, #16
+ ;sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_fast_loop
+ ldmia r0, {r4-r7}
+ ;ldm r0, {r4-r7}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r7}
+ ;stm r2, {r4-r7}
+ add r2, r2, r3
+
+ bne copy_mem16x16_fast_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_v6|
+
+ END
diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm
new file mode 100644
index 000000000..94473ca65
--- /dev/null
+++ b/vp8/common/arm/armv6/copymem8x4_v6.asm
@@ -0,0 +1,127 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x4_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x4_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #4
+
+copy_mem8x4_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x4_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #4
+
+copy_mem8x4_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x4_4_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #4
+
+copy_mem8x4_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x4_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_v6|
+
+ END
diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm
new file mode 100644
index 000000000..7cfa53389
--- /dev/null
+++ b/vp8/common/arm/armv6/copymem8x8_v6.asm
@@ -0,0 +1,127 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x8_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x8_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #8
+
+copy_mem8x8_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x8_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #8
+
+copy_mem8x8_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x8_4_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #8
+
+copy_mem8x8_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x8_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_v6|
+
+ END
diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
new file mode 100644
index 000000000..a7863fc94
--- /dev/null
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -0,0 +1,383 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_only_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_only_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr
+; r1 short *output_ptr
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp8_filter
+;-------------------------------------
+; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with
+; the output being a 2 byte value and the input being a 1 byte value.
+|vp8_filter_block2d_first_pass_armv6| PROC
+ ; Horizontal (first) pass: 6-tap filter on 8-bit source pixels producing a
+ ; 16-bit intermediate, rounded (+0x40, asr #7) and clamped with usat before
+ ; the second pass. Two output pixels are produced per inner-loop iteration.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ;;IF ARCHITECTURE=6
+ ;pld [r0, #-2]
+ ;;pld [r0, #30]
+ ;;ENDIF
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+ ; (low byte = width/2: two output pixels per iteration)
+
+|width_loop_1st_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_6
+
+ ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [r0, r2]
+ ;;pld [r0, r9]
+ ;;ENDIF
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000 ; decrement height (top halfword of counter)
+ add r0, r0, r2 ; move to next input line
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_second_pass_armv6| PROC
+ ; Vertical (second) pass: 6-tap filter over the transposed 16-bit
+ ; intermediate produced by the first pass, writing rounded and clamped
+ ; (usat) 8-bit output. Two output pixels per inner-loop iteration; the
+ ; repacked coefficients (r12/r11 below) let the second pixel reuse the
+ ; same loaded data via smulbt/smladx/smlatb.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ sub sp, sp, #4
+ mov r7, r3, lsl #16 ; height is top part of counter
+ str r1, [sp] ; push destination to stack
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+
+ sub r0, r0, #4 ; offset input buffer
+
+|height_loop_2nd|
+ ldr r8, [r0] ; load the data
+ ldr r9, [r0, #4]
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd|
+ smuad lr, r4, r8 ; apply filter
+ sub r7, r7, #1
+ smulbt r8, r4, r8
+
+ ldr r10, [r0, #8]
+
+ smlad lr, r5, r9, lr
+ smladx r8, r12, r9, r8
+
+ ldrh r9, [r0, #12]
+
+ smlad lr, r6, r10, lr
+ smladx r8, r11, r10, r8
+
+ add r0, r0, #4
+ smlatb r10, r6, r9, r8
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ands r8, r7, #0xff ; test low byte of loop counter
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r2 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ ldrne r8, [r0] ; load data for next loop
+ ldrne r9, [r0, #4]
+ strb r10, [r1], r2
+
+ bne width_loop_2nd
+
+ ldr r1, [sp] ; update dst for next loop
+ subs r7, r7, #0x10000 ; decrement height (top halfword of counter)
+ add r0, r0, #16 ; update src for next loop
+ add r1, r1, #1
+ str r1, [sp]
+
+ bne height_loop_2nd
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;------------------------------------
+; r0 unsigned char *src_ptr
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_first_pass_only_armv6| PROC
+ ; Horizontal-only 6-tap filter: reads 8-bit pixels and writes clamped
+ ; 8-bit output directly (no 16-bit intermediate buffer). r2 is loaded
+ ; with the rounding constant 0x40, which is folded into the first smlad
+ ; of each accumulator (replacing the separate 'add #0x40' -- see the
+ ; commented-out lines below).
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r4, [sp, #36] ; output pitch
+ ldr r11, [sp, #40] ; HFilter address
+ sub sp, sp, #8
+
+ mov r7, r3
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ sub r4, r4, r3
+ str r4, [sp] ; save modified output pitch
+ str r2, [sp, #4]
+
+ mov r2, #0x40 ; rounding constant for smlad below
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+ ldrb r8, [r0, #-2] ; load data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+
+ mov r12, r3, lsr #1 ; loop counter
+
+|width_loop_1st_only_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+;; smuad lr, lr, r4
+ smlad lr, lr, r4, r2
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+;; smuad r8, r8, r4
+ smlad r8, r8, r4, r2
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ subs r12, r12, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+;; add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+;; add r10, r10, #0x40
+ strb lr, [r1], #1 ; store the result
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0, #-1]
+ strb r10, [r1], #1
+ ldrneb r10, [r0], #2
+
+ bne width_loop_1st_only_6
+
+ ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [r0, r2]
+ ;;pld [r0, r9]
+ ;;ENDIF
+
+ ldr lr, [sp] ; load back modified output pitch
+ ldr r12, [sp, #4] ; load back modified src stride (r2 - width)
+ subs r7, r7, #1
+ add r0, r0, r12 ; update src for next loop
+ add r1, r1, lr ; update dst for next loop
+
+ bne height_loop_1st_only_6
+
+ add sp, sp, #8
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
+
+
+;------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_second_pass_only_armv6| PROC
+ ; Vertical-only 6-tap filter on 8-bit source pixels. The outer loop
+ ; walks columns; the inner loop filters one column, producing two output
+ ; rows per iteration, stepping through source rows with stride r2.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; VFilter address
+ ldr r12, [sp, #36] ; output pitch
+
+ mov r7, r3, lsl #16 ; height is top part of counter
+ sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
+
+ sub sp, sp, #8
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r0, [sp] ; save r0 to stack
+ str r1, [sp, #4] ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+ ldrb r8, [r0], r2 ; load data
+ orr r7, r7, r3 ; loop counter
+ ldrb r9, [r0], r2
+ ldrb r10, [r0], r2
+
+|height_loop_2nd_only_6|
+ ; filter first column in this inner loop, then move to next column.
+ ldrb r11, [r0], r2
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0], r2
+
+ smuad lr, lr, r4
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0], r2
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0]
+
+ sub r7, r7, #2 ; two output rows per iteration
+ sub r0, r0, r2, lsl #2 ; rewind src to start of next pair of rows
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+ ands r9, r7, #0xff ; test low byte of loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0], r2 ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r12 ; store the result for the column
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0], r2
+ strb r10, [r1], r12
+ ldrneb r10, [r0], r2
+
+ bne height_loop_2nd_only_6
+
+ ldr r0, [sp]
+ ldr r1, [sp, #4]
+ subs r7, r7, #0x10000 ; decrement column count (top halfword)
+ add r0, r0, #1 ; move to filter next column
+ str r0, [sp]
+ add r1, r1, #1
+ str r1, [sp, #4]
+
+ bne width_loop_2nd_only_6
+
+ add sp, sp, #8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
+
+ END
diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm
new file mode 100644
index 000000000..25c5165ec
--- /dev/null
+++ b/vp8/common/arm/armv6/idct_v6.asm
@@ -0,0 +1,376 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
+ EXPORT |vp8_short_idct4x4llm_1_v6|
+ EXPORT |vp8_short_idct4x4llm_v6|
+ EXPORT |vp8_short_idct4x4llm_v6_scott|
+ EXPORT |vp8_short_idct4x4llm_v6_dual|
+
+ EXPORT |vp8_dc_only_idct_armv6|
+
+ AREA |.text|, CODE, READONLY
+
+;********************************************************************************
+;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench: 3/5
+;********************************************************************************
+
+|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
+ ; DC-only inverse DCT: a1 = (input[0] + 4) >> 3, replicated into all 16
+ ; output shorts (one strd = 8 bytes = half a row of 4 shorts... each strd
+ ; writes r4:r5, i.e. one full row of four 16-bit values, at 'pitch' spacing).
+ ; NOTE(review): strd presumably requires the output pointer to be
+ ; doubleword-aligned on this core -- confirm against callers.
+ ldrsh r0, [r0] ; load input[0] 1, r0 un 2
+ add r0, r0, #4 ; 1 +4
+ stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
+ mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
+ pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
+ mov r5, r4 ; expand expand
+
+ strd r4, [r1], r2 ; *output = r0, post inc 1
+ strd r4, [r1], r2 ; 1
+ strd r4, [r1], r2 ; 1
+ strd r4, [r1] ; 1
+ ;
+ ldmia sp!, {r4, r5, pc} ; replace vars, return restore
+ ENDP ; |vp8_short_idct4x4llm_1_v6|
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
+ ; Full 4x4 inverse "LLM" DCT.
+ ; loop1: vertical pass over the 4 columns (input read at a stride of
+ ; 4 shorts), results written into the output buffer at 'pitch' spacing.
+ ; loop2: horizontal pass applied in place on loop1's output, with
+ ; rounding (x + 4) >> 3.
+ stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
+ ;
+ mov r4, #0x00004E00 ; 1 cst
+ orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 = 0x4E7B (20091)
+ mov r5, #0x00008A00 ; 1 cst
+ orr r5, r5, #0x0000008C ; sinpi8sqrt2 = 0x8A8C (35468)
+ ;
+ mov r6, #4 ; i=4 1 i
+loop1 ;
+ ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
+ ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
+ ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
+ ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
+ smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
+ smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
+ add r9, r7, r8 ; a1 = [0] + [8] 1 a1
+ sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
+ add r11, r3, r11 ; temp2 1
+ rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
+ smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
+ smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
+ add r8, r7, r11 ; b1 + c1 1 b+c
+ strh r8, [r1, r2] ; out[pitch] = b1+c1 1
+ sub r7, r7, r11 ; b1 - c1 1 b-c
+ add r10, r12, r10 ; temp1 1
+ add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
+ add r10, r9, r3 ; a1 + d1 1 a+d
+ sub r3, r9, r3 ; a1 - d1 1 a-d
+ add r8, r2, r2 ; pitch * 2 1 p*2
+ strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
+ add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
+ strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
+ subs r6, r6, #1 ; i-- 1 --
+ strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
+ bne loop1 ; if i>0, continue
+ ;
+ sub r1, r1, #8 ; set up out for next loop 1 -4
+ ; for this iteration, input=prev output
+ mov r6, #4 ; i=4 1 i
+; b returnfull
+loop2 ;
+ ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
+ ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
+ ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
+ ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
+ smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
+ smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
+ add r7, r0, r3 ; a1 = [0] + [2] 1 a1
+ sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
+ add r10, r8, r10 ; temp2 1
+ rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
+ smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
+ smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
+ add r3, r0, r9 ; b1+c1 1 b+c
+ add r3, r3, #4 ; b1+c1+4 1 +4
+ add r10, r11, r10 ; temp1 1
+ mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
+ strh r3, [r1, #2] ; out[1] = b1+c1 1
+ add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
+ add r3, r7, r10 ; a1+d1 1 a+d
+ add r3, r3, #4 ; a1+d1+4 1 +4
+ sub r7, r7, r10 ; a1-d1 1 a-d
+ add r7, r7, #4 ; a1-d1+4 1 +4
+ mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
+ mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
+ strh r7, [r1, #6] ; out[3] = a1-d1 1
+ sub r0, r0, r9 ; b1-c1 1 b-c
+ add r0, r0, #4 ; b1-c1+4 1 +4
+ subs r6, r6, #1 ; i-- 1 --
+ mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
+ strh r0, [r1, #4] ; out[2] = b1-c1 1
+ strh r3, [r1], r2 ; out[0] = a1+d1 1
+; add r1, r1, r2 ; out += pitch 1 ++
+ bne loop2 ; if i>0, continue
+returnfull ;
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+ ENDP
+
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
+ ; NOTE(review): this variant looks unfinished/experimental -- loop1 only
+ ; stores the a1+d1 / a1-d1 pairs (the 'op[pitch] = b1 + c1' comments below
+ ; sit on plain pointer increments, and the b/c results in r6/r9 are never
+ ; written), and the body of ..._loop2 is an empty count-down. Confirm this
+ ; symbol is not actually referenced before relying on it.
+; mov r0, #0 ;
+; ldr r0, [r0] ;
+ stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
+ ;
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 = 0x4E7B (20091)
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2 = 0x8A8C (35468)
+ ;
+ mov r5, #0x2 ; i i
+ ;
+short_idct4x4llm_v6_scott_loop1 ;
+ ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
+ ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
+ ;
+ smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
+ smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
+ ;
+ smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2minus1) >> 16) l2t2
+ smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
+ ;
+ add r6, r6, r7 ; partial c1 lt1-lt2
+ add r12, r12, r14 ; partial d1 l2t2+l2t1
+ ;
+ smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
+ smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
+ ;
+ smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
+ smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
+ ;
+ add r7, r14, r7 ; partial c1_2 ht1+ht2
+ sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
+ ;
+ pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
+ pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
+ ;
+ usub16 r6, r6, r10 ; c1_2 | c1_1 c
+ uadd16 r12, r12, r11 ; d1_2 | d1_1 d
+ ;
+ ldr r10, [r0, #0] ; i1 | i0 1,0
+ ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
+ ;
+;;;;;; add r0, r0, #0x4 ; +4
+;;;;;; add r1, r1, #0x4 ; +4
+ ;
+ uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
+ usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
+ ;
+ uadd16 r7, r8, r12 ; a1 + d1 pair a+d
+ usub16 r14, r8, r12 ; a1 - d1 pair a-d
+ ;
+ str r7, [r1] ; op[0] = a1 + d1
+ str r14, [r1, r2] ; op[pitch*3] = a1 - d1
+ ;
+ add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
+ add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
+ ;
+ subs r5, r5, #0x1 ; --
+ bne short_idct4x4llm_v6_scott_loop1 ;
+ ;
+ sub r1, r1, #16 ; reset output ptr
+ mov r5, #0x4 ;
+ mov r0, r1 ; input = output
+ ;
+short_idct4x4llm_v6_scott_loop2 ;
+ ; (empty -- second pass never implemented)
+ subs r5, r5, #0x1 ;
+ bne short_idct4x4llm_v6_scott_loop2 ;
+ ;
+ ldmia sp!, {r4 - r11, pc} ;
+ ENDP ;
+ ;
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
+ ; 4x4 inverse DCT processing two columns at a time with packed 16-bit
+ ; SIMD (uadd16/usub16). loop1_dual: vertical pass, input -> output
+ ; buffer. loop2_dual: horizontal pass applied *in place* on the output
+ ; (r0 is reset to the start of the output buffer; r1 is reused as a
+ ; plain data register from then on).
+ stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 = 0x4E7B (20091)
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2 = 0x8A8C (35468)
+ mov r5, #0x2 ; i=2 i
+loop1_dual
+ ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
+ ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
+ ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
+
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
+ pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
+ smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
+ smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
+ subs r5, r5, #0x1 ; i-- --
+ pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
+ ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+ usub16 r7, r8, r7 ; c c
+ uadd16 r6, r6, r10 ; d d
+ uadd16 r10, r11, r14 ; a a
+ usub16 r8, r11, r14 ; b b
+ uadd16 r9, r10, r6 ; a+d a+d
+ usub16 r10, r10, r6 ; a-d a-d
+ uadd16 r6, r8, r7 ; b+c b+c
+ usub16 r7, r8, r7 ; b-c b-c
+ str r6, [r1, r2] ; o5 | o4
+ add r6, r2, r2 ; pitch * 2 p2
+ str r7, [r1, r6] ; o9 | o8
+ add r6, r6, r2 ; pitch * 3 p3
+ str r10, [r1, r6] ; o13 | o12
+ str r9, [r1], #0x4 ; o1 | o0 ++
+ bne loop1_dual ;
+ mov r5, #0x2 ; i=2 i
+ sub r0, r1, #8 ; reset input/output i/o
+loop2_dual
+ ldr r6, [r0, r2] ; i5 | i4 5|4
+ ldr r1, [r0] ; i1 | i0 1|0
+ ldr r12, [r0, #0x4] ; i3 | i2 3|2
+ add r14, r2, #0x4 ; pitch + 4 bytes p+4
+ ldr r14, [r0, r14] ; i7 | i6 7|6
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
+ pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
+ pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1
+ pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
+ uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
+ uadd16 r10, r11, r9 ; a a
+ usub16 r9, r11, r9 ; b b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
+ subs r5, r5, #0x1 ; i-- --
+ smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
+ smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
+
+ pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
+ usub16 r12, r8, r6 ; c (o1 | o5) c
+ uadd16 r6, r11, r1 ; d (o3 | o7) d
+ uadd16 r7, r10, r6 ; a+d a+d
+ mov r8, #0x4 ; set up rounding constant 4 in both halves 4
+ orr r8, r8, #0x40000 ; 4|4
+ usub16 r6, r10, r6 ; a-d a-d
+ uadd16 r6, r6, r8 ; a-d+4 3|7
+ uadd16 r7, r7, r8 ; a+d+4 0|4
+ uadd16 r10, r9, r12 ; b+c b+c
+ usub16 r1, r9, r12 ; b-c b-c
+ uadd16 r10, r10, r8 ; b+c+4 1|5
+ uadd16 r1, r1, r8 ; b-c+4 2|6
+ mov r8, r10, asr #19 ; o1 = top halfword >> 3
+ strh r8, [r0, #2] ; o1
+ mov r8, r1, asr #19 ; o2 = top halfword >> 3
+ strh r8, [r0, #4] ; o2
+ mov r8, r6, asr #19 ; o3 = top halfword >> 3
+ strh r8, [r0, #6] ; o3
+ mov r8, r7, asr #19 ; o0 = top halfword >> 3
+ strh r8, [r0], r2 ; o0 +p
+ sxth r10, r10 ; bottom halfword
+ mov r8, r10, asr #3 ; o5 >> 3
+ strh r8, [r0, #2] ; o5
+ sxth r1, r1 ; bottom halfword
+ mov r8, r1, asr #3 ; o6 >> 3
+ strh r8, [r0, #4] ; o6
+ sxth r6, r6 ; bottom halfword
+ mov r8, r6, asr #3 ; o7 >> 3
+ strh r8, [r0, #6] ; o7
+ sxth r7, r7 ; bottom halfword
+ mov r8, r7, asr #3 ; o4 >> 3
+ strh r8, [r0], r2 ; o4 +p
+;;;;; subs r5, r5, #0x1 ; i-- --
+ bne loop2_dual ;
+ ;
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+ ENDP
+
+
+; sjl added 10/17/08
+;void dc_only_idct_armv6(short input_dc, short *output, int pitch)
+|vp8_dc_only_idct_armv6| PROC
+ ; DC-only idct: a1 = (input_dc + 4) >> 3, replicated across a 4x4 block
+ ; of shorts (two word stores per row, four rows at 'pitch' spacing).
+ stmdb sp!, {r4 - r6, lr}
+
+ add r0, r0, #0x4
+ add r4, r1, r2 ; output + shortpitch
+ mov r0, r0, ASR #0x3 ;aka a1
+ add r5, r1, r2, LSL #1 ; output + shortpitch * 2
+ pkhbt r0, r0, r0, lsl #16 ; a1 | a1
+ add r6, r5, r2 ; output + shortpitch * 3
+
+ str r0, [r1, #0]
+ str r0, [r1, #4]
+
+ str r0, [r4, #0]
+ str r0, [r4, #4]
+
+ str r0, [r5, #0]
+ str r0, [r5, #4]
+
+ str r0, [r6, #0]
+ str r0, [r6, #4]
+
+
+ ldmia sp!, {r4 - r6, pc}
+
+ ENDP ; |vp8_dc_only_idct_armv6|
+
+ END
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
new file mode 100644
index 000000000..87475681f
--- /dev/null
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -0,0 +1,151 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_inv_walsh4x4_armv6|
+ EXPORT |vp8_short_inv_walsh4x4_1_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
+|vp8_short_inv_walsh4x4_armv6| PROC
+ ; 4x4 inverse Walsh-Hadamard transform. The first pass operates on
+ ; packed pairs of 16-bit coefficients down the columns (qadd16/qsub16);
+ ; the second pass works across rows via the cross add/sub instructions
+ ; (qaddsubx/qsubaddx), then rounds each result with (x + 3) >> 3 before
+ ; storing (the packed 3|3 constant comes from the literal pool below).
+
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r2, [r0], #4 ; [1 | 0]
+ ldr r3, [r0], #4 ; [3 | 2]
+ ldr r4, [r0], #4 ; [5 | 4]
+ ldr r5, [r0], #4 ; [7 | 6]
+ ldr r6, [r0], #4 ; [9 | 8]
+ ldr r7, [r0], #4 ; [11 | 10]
+ ldr r8, [r0], #4 ; [13 | 12]
+ ldr r9, [r0] ; [15 | 14]
+
+ qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
+ qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
+ qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
+ qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
+
+ qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
+ qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
+ qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
+ qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
+
+ qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
+ qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
+ qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
+ qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
+
+ qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
+ qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
+ qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
+ qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
+
+ ; first transform complete
+
+ qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
+ qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
+ qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
+ qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
+
+ qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
+ ldr r10, c0x00030003 ; packed rounding constant 3|3
+ qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r2, r2, r10 ; [b2+3|c2+3]
+ qadd16 r3, r3, r10 ; [a2+3|d2+3]
+ qadd16 r4, r4, r10 ; [b2+3|c2+3]
+ qadd16 r5, r5, r10 ; [a2+3|d2+3]
+
+ asr r12, r2, #3 ; [1 | x]
+ pkhtb r12, r12, r3, asr #19; [1 | 0]
+ lsl lr, r3, #16 ; [~3 | x]
+ lsl r2, r2, #16 ; [~2 | x]
+ asr lr, lr, #3 ; [3 | x]
+ pkhtb lr, lr, r2, asr #19 ; [3 | 2]
+
+ asr r2, r4, #3 ; [5 | x]
+ pkhtb r2, r2, r5, asr #19 ; [5 | 4]
+ lsl r3, r5, #16 ; [~7 | x]
+ lsl r4, r4, #16 ; [~6 | x]
+ asr r3, r3, #3 ; [7 | x]
+ pkhtb r3, r3, r4, asr #19 ; [7 | 6]
+
+ str r12, [r1], #4
+ str lr, [r1], #4
+ str r2, [r1], #4
+ str r3, [r1], #4
+
+ qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
+ qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
+ qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
+ qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
+
+ qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
+ qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r6, r6, r10 ; [b2+3|c2+3]
+ qadd16 r7, r7, r10 ; [a2+3|d2+3]
+ qadd16 r8, r8, r10 ; [b2+3|c2+3]
+ qadd16 r9, r9, r10 ; [a2+3|d2+3]
+
+ asr r2, r6, #3 ; [9 | x]
+ pkhtb r2, r2, r7, asr #19 ; [9 | 8]
+ lsl r3, r7, #16 ; [~11| x]
+ lsl r4, r6, #16 ; [~10| x]
+ asr r3, r3, #3 ; [11 | x]
+ pkhtb r3, r3, r4, asr #19 ; [11 | 10]
+
+ asr r4, r8, #3 ; [13 | x]
+ pkhtb r4, r4, r9, asr #19 ; [13 | 12]
+ lsl r5, r9, #16 ; [~15| x]
+ lsl r6, r8, #16 ; [~14| x]
+ asr r5, r5, #3 ; [15 | x]
+ pkhtb r5, r5, r6, asr #19 ; [15 | 14]
+
+ str r2, [r1], #4
+ str r3, [r1], #4
+ str r4, [r1], #4
+ str r5, [r1]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_inv_walsh4x4_armv6|
+
+
+;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_armv6| PROC
+ ; DC-only inverse WHT: a1 = (input[0] + 3) >> 3, duplicated into both
+ ; halves of a word and stored to all 16 output shorts.
+
+ ldrsh r2, [r0] ; [0]
+ add r2, r2, #3 ; [0] + 3
+ asr r2, r2, #3 ; a1 ([0]+3) >> 3
+ lsl r2, r2, #16 ; [a1 | x]
+ orr r2, r2, r2, lsr #16 ; [a1 | a1]
+
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1]
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_1_armv6|
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+ END
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm
new file mode 100644
index 000000000..c2b02dc0a
--- /dev/null
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -0,0 +1,1263 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_vertical_edge_armv6|
+ EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; 4x4 byte-matrix transpose built from uxtb16/orr/pkhbt/pkhtb.
+ ; NOTE: $a0-$a3 are clobbered (reused as scratch for the odd columns).
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ ; even-numbered source bytes -> $b1/$b3 as packed column pairs
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ ; odd-numbered source bytes -> $a0/$a2 as packed column pairs
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ ; combine halfword pairs into the four transposed rows
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
+
+
+src RN r0
+pstep RN r1
+count RN r5
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *flimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
+;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+; Normal (non-macroblock) loop filter across a horizontal edge.
+; Processes 4 pixels per iteration using ARMv6 packed-byte SIMD
+; (uqsub8/uqadd8/qadd8/qsub8/sel), looping count*2 times.
+; r0=src (points at the edge), r1=pixel step, r2=flimit ptr, r3=limit ptr,
+; [sp+36]=thresh ptr, [sp+40]=count. All 16 bytes of each threshold
+; buffer are equal, so a single 4-byte load broadcasts the value.
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldr r4, [r2], #4 ; flimit
+ ldr r10, [src], pstep ; p2
+ ldr r2, [r3], #4 ; limit
+ ldr r11, [src], pstep ; p1
+ uadd8 r4, r4, r4 ; flimit * 2 (per-byte; assumes 2*flimit+limit fits a byte -- per VP8 ranges)
+ ldr r3, [r6], #4 ; thresh
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|Hnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; lr accumulates per-byte "exceeds limit" flags; any nonzero byte
+ ; at the end means that pixel column must NOT be filtered.
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+ orr lr, lr, r10
+ sub src, src, pstep, lsl #2
+
+ ; turn accumulated flags into a per-byte 0x00/0xFF mask
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq hskip_filter ; skip filtering
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ orr r10, r6, r8 ; calculate vp8_hevmask
+
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10 ; use usub8 instead of ssub8
+ sel r6, r12, r11 ; obtain vp8_hevmask: r6
+
+ ;vp8_filter() function
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+
+ ; bias by 0x80 so the unsigned bytes can be treated as signed values
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev
+
+ ; add (qs0-ps0) three times, saturating each step
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ and r7, r7, lr ; vp8_filter &= mask;
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ ; shadd8 with zero = signed per-byte >> 1; three applications = >> 3
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: Filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;end of modification for vp8
+
+ mov lr, #0
+ sadd8 r7, r7 , r10 ; vp8_filter += 1
+ shadd8 r7, r7, lr ; vp8_filter >>= 1
+
+ ldr r11, [sp, #12] ; load ps1
+ ldr r10, [sp, #8] ; load qs1
+
+ bic r7, r7, r6 ; vp8_filter &= ~hev
+ sub src, src, pstep, lsl #2
+
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+ ; undo the 0x80 bias and write the four filtered rows back
+ eor r11, r11, r12 ; *op1 = u^0x80
+ str r11, [src], pstep ; store op1
+ eor r9, r9, r12 ; *op0 = u^0x80
+ str r9, [src], pstep ; store op0 result
+ eor r8, r8, r12 ; *oq0 = u^0x80
+ str r8, [src], pstep ; store oq0 result
+ eor r10, r10, r12 ; *oq1 = u^0x80
+ str r10, [src], pstep ; store oq1
+
+ sub src, src, pstep, lsl #1
+
+|hskip_filter|
+ ; advance to the next 4 pixel columns and rewind to the p3 row
+ add src, src, #4
+ sub src, src, pstep, lsl #2
+
+ subs count, count, #1
+
+ ;pld [src]
+ ;pld [src, pstep]
+ ;pld [src, pstep, lsl #1]
+ ;pld [src, pstep, lsl #2]
+ ;pld [src, pstep, lsl #3]
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne Hnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+; Macroblock-edge loop filter across a horizontal edge. Same mask/hev
+; computation as the normal filter, but the filter itself adjusts three
+; pixel pairs (p2..q2) with the 27/18/9 tap weights (approx. 3/7, 2/7,
+; 1/7 of the clamped difference). 4 pixels per iteration, count*2 loops.
+; r0=src, r1=pixel step, r2=flimit ptr, r3=limit ptr,
+; [sp+36]=thresh ptr, [sp+40]=count.
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldr r4, [r2], #4 ; flimit
+ ldr r10, [src], pstep ; p2
+ ldr r2, [r3], #4 ; limit
+ ldr r11, [src], pstep ; p1
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r6], #4 ; thresh
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|MBHnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+
+ orr lr, lr, r10
+
+ ; turn accumulated flags into a per-byte 0x00/0xFF mask
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbhskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
+ sub src, src, pstep, lsl #1
+
+ orr r10, r6, r8
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_mbfilter() function
+ ;p2, q2 are only needed at the end. Don't need to load them in now.
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src] ; q1
+
+ ; bias by 0x80 to work on signed values
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ ; shadd8 with zero = signed per-byte >> 1; three applications = >> 3
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ ; per-halfword: u = clamp((filter*27 + 63) >> 7), done via
+ ; sxtb16 unpack + smlabb/smlatb multiply-accumulate + ssat
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr ; r7 (the 63 bias) was clobbered by the smlabb above,
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63 ; ...so multiply plain and add the bias separately
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ str r8, [src] ; store *oq0
+ sub src, src, pstep
+ eor r10, r10, lr ; *op0 = s^0x80
+ str r10, [src] ; store *op0
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ eor r11, r11, lr ; *op1 = s^0x80
+ str r11, [src], pstep ; store *op1
+ eor r8, r8, lr ; *oq1 = s^0x80
+ add src, src, pstep, lsl #1
+
+ mov r7, #0x3f ; 63
+
+ str r8, [src], pstep ; store *oq1
+
+ ;roughly 1/7th difference across boundary
+ mov lr, #0x9 ; 9
+ ldr r9, [src] ; load q2
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ sub src, src, pstep
+ ldr lr, c0x80808080
+
+ ldr r11, [src] ; load p2
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ ; p2/q2 were never biased above; bias them now before the signed math
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ str r8, [src], pstep, lsl #2 ; store *op2
+ add src, src, pstep
+ eor r10, r10, lr ; *oq2 = s^0x80
+ str r10, [src], pstep, lsl #1 ; store *oq2
+
+|mbhskip_filter|
+ ; advance to the next 4 pixel columns and rewind to the p3 row
+ add src, src, #4
+ sub src, src, pstep, lsl #3
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne MBHnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+; Normal loop filter across a vertical edge. Loads 4 rows of 4 bytes,
+; transposes them with TRANSPOSE_MATRIX so columns become packed words,
+; then applies the same SIMD filter math as the horizontal variant and
+; transposes the result back before storing.
+; r0=src, r1=pixel step, r2=flimit ptr, r3=limit ptr,
+; [sp+36]=thresh ptr, [sp+40]=count.
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldr r4, [r2], #4 ; flimit
+ ldr r7, [src], pstep
+ ldr r2, [r3], #4 ; limit
+ ldr r8, [src], pstep
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r12], #4 ; thresh
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|Vnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ ; turn accumulated flags into a per-byte 0x00/0xFF mask
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq vskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_filter() function
+ ; load soure data to r6, r11, r12, lr
+ ; reload only the center 2x4 (p1 p0 | q0 q1) as halfwords per row
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+ str r6, [sp]
+ str lr, [sp, #4]
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ ; bias by 0x80 so the unsigned bytes can be treated as signed values
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter)
+
+ ; add (qs0-ps0) three times, saturating each step
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ ; shadd8 with zero = signed per-byte >> 1; three applications = >> 3
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ ;end of modification for vp8
+
+ eor r8, r8, r12
+ eor r9, r9, r12
+
+ mov lr, #0
+
+ sadd8 r7, r7, r10
+ shadd8 r7, r7, lr
+
+ ldr r10, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ bic r7, r7, r6 ; r7: vp8_filter
+
+ qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ eor r10, r10, r12
+ eor r11, r11, r12
+
+ sub src, src, pstep, lsl #2
+
+ ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+ ;output is b0, b1, b2, b3
+ ;b0: 03 02 01 00
+ ;b1: 13 12 11 10
+ ;b2: 23 22 21 20
+ ;b3: 33 32 31 30
+ ; p1 p0 q0 q1
+ ; (a3 a2 a1 a0)
+ TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+ ; store each transposed row back as two halfword writes (p1p0 | q0q1)
+ strh r6, [src, #-2] ; store the result
+ mov r6, r6, lsr #16
+ strh r6, [src], pstep
+
+ strh r7, [src, #-2]
+ mov r7, r7, lsr #16
+ strh r7, [src], pstep
+
+ strh r12, [src, #-2]
+ mov r12, r12, lsr #16
+ strh r12, [src], pstep
+
+ strh lr, [src, #-2]
+ mov lr, lr, lsr #16
+ strh lr, [src], pstep
+
+|vskip_filter|
+ ; step back to the column start and loop for the next 4 rows
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne Vnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_armv6|
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+; Macroblock-edge loop filter across a vertical edge. Combines the
+; transpose approach of the vertical filter with the 27/18/9-tap
+; macroblock filter (adjusts p2..q2). Results are written back with
+; per-byte strb stores since the output columns are not word-aligned.
+; r0=src, r1=pixel step, r2=flimit ptr, r3=limit ptr,
+; [sp+36]=thresh ptr, [sp+40]=count.
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldr r4, [r2], #4 ; flimit
+ ldr r7, [src], pstep
+ ldr r2, [r3], #4 ; limit
+ ldr r8, [src], pstep
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r12], #4 ; thresh
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|MBVnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ ; turn accumulated flags into a per-byte 0x00/0xFF mask
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbvskip_filter ; skip filtering
+
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+
+ ; vp8_mbfilter() function
+ ; p2, q2 are only needed at the end. Don't need to load them in now.
+ ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+ ; load soure data to r6, r11, r12, lr
+ ; reload only the center 2x4 (p1 p0 | q0 q1) as halfwords per row
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ str r6, [sp] ; save r6
+ str lr, [sp, #4] ; save lr
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ ; bias by 0x80 to work on signed values
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ ; shadd8 with zero = signed per-byte >> 1; three applications = >> 3
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ ; per-halfword: u = clamp((filter*27 + 63) >> 7), done via
+ ; sxtb16 unpack + smlabb/smlatb multiply-accumulate + ssat
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr ; r7 (the 63 bias) was clobbered by the smlabb above,
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63 ; ...so multiply plain and add the bias separately
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ eor r10, r10, lr ; *op0 = s^0x80
+
+ ; byte-wise writeback: one p0/q0 pair per row, shifting the packed
+ ; words down a byte each time
+ strb r10, [src, #-1] ; store op0 result
+ strb r8, [src], pstep ; store oq0 result
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+ ldr lr, c0x80808080
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ add src, src, #2
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ eor r8, r8, lr ; *oq1 = s^0x80
+ eor r10, r10, lr ; *op1 = s^0x80
+
+ ; interleave the op1/oq1 byte stores with gathering p2/q2 bytes
+ ; (packed into r11/r9) for the 1/7th pass below
+ ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
+ strb r10, [src, #-4] ; store op1
+ strb r8, [src, #-1] ; store oq1
+ ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #8
+ orr r9, r9, r7, lsl #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #16
+ orr r9, r9, r7, lsl #16
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+ orr r11, r11, r6, lsl #24
+ orr r9, r9, r7, lsl #24
+
+ ;roughly 1/7th difference across boundary
+ ; p2/q2 were never biased above; bias them now before the signed math
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ mov lr, #0x9 ; 9
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ ldr lr, c0x80808080
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ eor r10, r10, lr ; *oq2 = s^0x80
+
+ strb r8, [src, #-5] ; store *op2
+ strb r10, [src], pstep ; store *oq2
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+
+ ;adjust src pointer for next loop
+ sub src, src, #2
+
+|mbvskip_filter|
+ ; step back to the column start and loop for the next 4 rows
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne MBVnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm
new file mode 100644
index 000000000..085ff80c9
--- /dev/null
+++ b/vp8/common/arm/armv6/recon_v6.asm
@@ -0,0 +1,280 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon_b_armv6|
+ EXPORT |vp8_recon2b_armv6|
+ EXPORT |vp8_recon4b_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+prd RN r0
+dif RN r1
+dst RN r2
+stride RN r3
+
+;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
+; R0 char* pred_ptr
+; R1 short * dif_ptr
+; R2 char * dst_ptr
+; R3 int stride
+
+; Description:
+; Loop through the block adding the Pred and Diff together. Clamp and then
+; store back into the Dst.
+
+; Restrictions :
+; all buffers are expected to be 4 byte aligned coming in and
+; going out.
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; Reconstruct one 4x4 block: dst[i] = clamp8(pred[i] + diff[i]).
+; Each row adds four 16-bit diff values to four 8-bit prediction bytes
+; using ARMv6 SIMD (pkhbt/pkhtb to interleave, uxtab16 to add, usat16
+; to clamp to [0,255]). prd advances 16 bytes per row, dif advances
+; 32 bytes (16 shorts) per row, dst advances by stride.
+|vp8_recon_b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #8] ; 1 | 0
+;; ldr r7, [dif, #12] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #16] ; 1 | 0
+;; ldr r7, [dif, #20] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #24] ; 1 | 0
+;; ldr r7, [dif, #28] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |recon_b|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char *pred_ptr
+; R1 short *dif_ptr
+; R2 char *dst_ptr
+; R3 int stride
+; Reconstruct a 16-wide, 4-row region (four adjacent 4x4 blocks per
+; row): dst[i] = clamp8(pred[i] + diff[i]). Each loop iteration
+; handles one 16-pixel row in four 4-pixel SIMD groups; prd is read
+; contiguously, dif advances 32 bytes (16 shorts) per row.
+|vp8_recon4b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ mov lr, #4 ; row counter
+
+recon4b_loop
+ ;0, 1, 2, 3
+ ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst]
+
+ ;4, 5, 6, 7
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #32]
+;; ldr r7, [dif, #36]
+ ldr r6, [dif, #8]
+ ldr r7, [dif, #12]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #4]
+
+ ;8, 9, 10, 11
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #64]
+;; ldr r7, [dif, #68]
+ ldr r6, [dif, #16]
+ ldr r7, [dif, #20]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #8]
+
+ ;12, 13, 14, 15
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #96]
+;; ldr r7, [dif, #100]
+ ldr r6, [dif, #24]
+ ldr r7, [dif, #28]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #12]
+
+ add dst, dst, stride
+;; add dif, dif, #8
+ add dif, dif, #32
+
+ subs lr, lr, #1
+ bne recon4b_loop
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |Recon4B|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char *pred_ptr
+; R1 short *dif_ptr
+; R2 char *dst_ptr
+; R3 int stride
+; Reconstruct an 8-wide, 4-row region (two adjacent 4x4 blocks per
+; row): dst[i] = clamp8(pred[i] + diff[i]). Each loop iteration
+; handles one 8-pixel row in two 4-pixel SIMD groups; dif advances
+; 16 bytes (8 shorts) per row.
+|vp8_recon2b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ mov lr, #4 ; row counter
+
+recon2b_loop
+ ;0, 1, 2, 3
+ ldr r4, [prd], #4
+ ldr r6, [dif, #0]
+ ldr r7, [dif, #4]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst]
+
+ ;4, 5, 6, 7
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #32]
+;; ldr r7, [dif, #36]
+ ldr r6, [dif, #8]
+ ldr r7, [dif, #12]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #4]
+
+ add dst, dst, stride
+;; add dif, dif, #8
+ add dif, dif, #16
+
+ subs lr, lr, #1
+ bne recon2b_loop
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |Recon2B|
+
+ END
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
new file mode 100644
index 000000000..15c6c7d16
--- /dev/null
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -0,0 +1,321 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ ; Transpose a 4x4 matrix of bytes packed in four words.
+ ; Used to turn 4 horizontal pixel rows into the p1/p0/q0/q1
+ ; columns needed when filtering a vertical edge.
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; NOTE: the input registers $a0-$a3 are clobbered.
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
+
+
+src RN r0
+pstep RN r1
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *flimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
+;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+; Simple loop filter for a horizontal edge, 4 pixels per pass.
+; Fix vs. the original: the per-byte filter mask (computed into r10 by
+; sel) is now actually applied with "and r3, r3, r10". Previously the
+; and used lr, which holds 0xFFFFFFFF at that point (mvn lr, #0), so
+; whenever any one of the 4 pixels passed the mask test all 4 were
+; filtered — diverging from the C reference, which does
+; "vp8_filter &= mask" per pixel.
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines
+
+ ldr r12, [r3], #4 ; limit
+ ldr r3, [src], pstep ; p1
+
+ ldr r9, [sp, #36] ; count for 8-in-parallel
+ ldr r4, [src], pstep ; p0
+
+ ldr r7, [r2], #4 ; flimit
+ ldr r5, [src], pstep ; q0
+ ldr r2, c0x80808080
+
+ ldr r6, [src] ; q1
+
+ uadd8 r7, r7, r7 ; flimit * 2
+ mov r9, r9, lsl #1 ; 4-in-parallel
+ uadd8 r12, r7, r12 ; flimit * 2 + limit
+
+|simple_hnext8|
+ ; vp8_simple_filter_mask() function
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r10, r4, r5 ; p0 - q0
+ uqsub8 r11, r5, r4 ; q0 - p0
+ orr r8, r8, r7 ; abs(p1 - q1)
+ ldr lr, c0x7F7F7F7F ; 01111111 mask
+ orr r10, r10, r11 ; abs(p0 - q0)
+ and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2
+ uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
+ mvn lr, #0 ; lr == -1
+ uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ ; STALL waiting on r10 :(
+ uqsub8 r10, r10, r12 ; compare to flimit
+ mov r8, #0
+
+ usub8 r10, r8, r10 ; use usub8 instead of ssub8
+ ; STALL (maybe?) when are flags set? :/
+ sel r10, lr, r8 ; filter mask in r10 (0xff = filter this pixel)
+
+ cmp r10, #0
+ beq simple_hskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+ qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+
+ qadd8 r3, r3, r6
+ ldr r8, c0x03030303 ; r8 = 3
+
+ qadd8 r3, r3, r6
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6
+ and r3, r3, r10 ; vp8_filter &= mask (per byte, as in the C reference)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r7, #0
+ shadd8 r8 , r8 , r7 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r7 ; Filter1 >>= 3
+ shadd8 r8 , r8 , r7
+ shadd8 r3 , r3 , r7
+ shadd8 r8 , r8 , r7 ; r8: Filter2
+ shadd8 r3 , r3 , r7 ; r3: Filter1
+
+ ;calculate output
+ sub src, src, pstep, lsl #1
+
+ qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
+ qsub8 r5 ,r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
+ eor r4, r4, r2 ; *op0 = u^0x80
+ str r4, [src], pstep ; store op0 result
+ eor r5, r5, r2 ; *oq0 = u^0x80
+ str r5, [src], pstep ; store oq0 result
+
+|simple_hskip_filter|
+ add src, src, #4
+ sub src, src, pstep
+ sub src, src, pstep, lsl #1
+
+ subs r9, r9, #1
+
+ ;pld [src]
+ ;pld [src, pstep]
+ ;pld [src, pstep, lsl #1]
+
+ ldrne r3, [src], pstep ; p1
+ ldrne r4, [src], pstep ; p0
+ ldrne r5, [src], pstep ; q0
+ ldrne r6, [src] ; q1
+
+ bne simple_hnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+; Simple loop filter for a vertical edge: 4 rows are loaded, transposed
+; to p1/p0/q0/q1 columns, filtered, and the two changed columns are
+; stored back byte by byte.
+; Fixes vs. the original:
+; 1) The filter mask is now selected into lr ("sel lr, r10, r8"), which
+; is what the following "cmp lr, #0" and "and r3, r3, lr" read. The
+; original selected it into r7 while lr still held the 0x7F7F7F7F
+; constant, so the skip branch was never taken, the filter value was
+; ANDed with 0x7F7F7F7F (corrupting negative filter bytes), and r7
+; was clobbered by "ldr r7, c0x04040404" before it could be used.
+; lr is safe to reuse here: c0x7F7F7F7F is reloaded each pass.
+; 2) The epilogue now pops {r4 - r11, pc}, matching the 9-register
+; prologue push; the original popped {r4 - r12, pc}, unbalancing the
+; stack and restoring pc from the wrong slot.
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r12, [r2], #4 ; r12: flimit
+ ldr r2, c0x80808080
+ ldr r7, [r3], #4 ; limit
+
+ ; load source data to r7, r8, r9, r10
+ ldrh r3, [src, #-2]
+ ldrh r4, [src], pstep
+ uadd8 r12, r12, r12 ; flimit * 2
+
+ ldrh r5, [src, #-2]
+ ldrh r6, [src], pstep
+ uadd8 r12, r12, r7 ; flimit * 2 + limit
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrh r3, [src, #-2]
+ ldrh r4, [src], pstep
+ ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrh r5, [src, #-2]
+ ldrh r6, [src], pstep
+ mov r11, r11, lsl #1 ; 4-in-parallel
+
+|simple_vnext8|
+ ; vp8_simple_filter_mask() function
+ pkhbt r9, r3, r4, lsl #16
+ pkhbt r10, r5, r6, lsl #16
+
+ ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+ TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r9, r4, r5 ; p0 - q0
+ uqsub8 r10, r5, r4 ; q0 - p0
+ orr r7, r7, r8 ; abs(p1 - q1)
+ orr r9, r9, r10 ; abs(p0 - q0)
+ ldr lr, c0x7F7F7F7F ; 0111 1111 mask
+ uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+ and r7, lr, r7, lsr #1 ; abs(p1 - q1) / 2
+ mov r8, #0
+ uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r10, #0 ; r10 == -1
+ uqsub8 r7, r7, r12 ; compare to flimit
+
+ usub8 r7, r8, r7
+ sel lr, r10, r8 ; filter mask in lr (0xff = filter this pixel)
+
+ cmp lr, #0
+ beq simple_vskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+ qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+
+ qadd8 r3, r3, r6
+ ldr r8, c0x03030303 ; r8 = 3
+
+ qadd8 r3, r3, r6
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6
+ and r3, r3, lr ; vp8_filter &= mask
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r7, #0
+ shadd8 r8 , r8 , r7 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r7 ; Filter1 >>= 3
+ shadd8 r8 , r8 , r7
+ shadd8 r3 , r3 , r7
+ shadd8 r8 , r8 , r7 ; r8: Filter2
+ shadd8 r3 , r3 , r7 ; r3: Filter1
+
+ ;calculate output
+ sub src, src, pstep, lsl #2
+
+ qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
+ qsub8 r5, r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
+ eor r4, r4, r2 ; *op0 = u^0x80
+ eor r5, r5, r2 ; *oq0 = u^0x80
+
+ strb r4, [src, #-1] ; store the result
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ strb r5, [src], pstep
+
+|simple_vskip_filter|
+ subs r11, r11, #1
+
+ ;pld [src]
+ ;pld [src, pstep]
+ ;pld [src, pstep, lsl #1]
+
+ ; load source data to r7, r8, r9, r10
+ ldrneh r3, [src, #-2]
+ ldrneh r4, [src], pstep
+
+ ldrneh r5, [src, #-2]
+ ldrneh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrneh r3, [src, #-2]
+ ldrneh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrneh r5, [src, #-2]
+ ldrneh r6, [src], pstep
+
+ bne simple_vnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..551d863e9
--- /dev/null
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: The first pass stores its result transposed (8 lines x 9 columns) on the stack; the temporary
+;stack area is 184 bytes. Each line is 20 bytes wide: 9 shorts of data plus one 2-byte pad to keep
+;lines 4-byte aligned. The second pass reads the data back from the stack and stores its result
+;transposed again.
+|vp8_sixtap_predict8x4_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ sub sp, sp, #184 ;reserve space on stack for temporary storage: 20x(8+1) +4
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ ; yoffset is parked in the bottom word of the reserved area and the
+ ; post-indexed writeback bumps sp past it; it is read back later
+ ; with "ldr r3, [sp, #-4]" at secondpass_filter.
+ str r3, [sp], #4 ;store yoffset
+ beq skip_firstpass_filter
+
+;first-pass filter
+ ldr r12, _filter8_coeff_
+ sub r0, r0, r1, lsl #1
+
+ add r2, r12, r2, lsl #4 ;calculate filter location
+ add r0, r0, #3 ;adjust src only for loading convenience
+
+ ldr r3, [r2] ; load up packed filter coefficients
+ ldr r4, [r2, #4]
+ ldr r5, [r2, #8]
+
+ mov r2, #0x90000 ; height=9 is top part of counter
+
+ sub r1, r1, #8
+ mov lr, #20
+
+|first_pass_hloop_v6|
+ ldrb r6, [r0, #-5] ; load source data
+ ldrb r7, [r0, #-4]
+ ldrb r8, [r0, #-3]
+ ldrb r9, [r0, #-2]
+ ldrb r10, [r0, #-1]
+
+ orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+ pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+ pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+ pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+ smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
+ smuad r12, r7, r3
+
+ ldrb r6, [r0], #1
+
+ smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
+ ldrb r7, [r0], #1
+ smlad r12, r9, r4, r12
+
+ pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+ pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+ smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
+ smlad r12, r6, r5, r12
+
+ sub r2, r2, #1
+
+ add r11, r11, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff ; test loop counter
+ usat r11, #8, r11, asr #7
+ add r12, r12, #0x40
+ ; the store's writeback of +20 (lr) walks down one "column" of the
+ ; transposed temp buffer
+ strh r11, [sp], lr ; result is transposed and stored, which
+ usat r12, #8, r12, asr #7
+
+ strh r12, [sp], lr
+
+ movne r11, r6
+ movne r12, r7
+
+ movne r6, r8
+ movne r7, r9
+ movne r8, r10
+ movne r9, r11
+ movne r10, r12
+
+ bne first_pass_wloop_v6
+
+ ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [src, ppl]
+ ;;pld [src, r9]
+ ;;ENDIF
+
+ subs r2, r2, #0x10000
+
+ ; rewind sp to the start of the next output row: 8 stores of +20
+ ; advanced it by 160, minus the +2 step to the next row = 158
+ mov r6, #158
+ sub sp, sp, r6
+
+ add r0, r0, r1 ; move to next input line
+
+ bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+ mov r1, #18
+ sub sp, sp, r1 ; 18+4
+
+ ldr r3, [sp, #-4] ; load back yoffset
+ ldr r0, [sp, #216] ; load dst address from stack 180+36
+ ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+ cmp r3, #0
+ beq skip_secondpass_filter
+
+ ldr r12, _filter8_coeff_
+ add lr, r12, r3, lsl #4 ;calculate filter location
+
+ mov r2, #0x00080000
+
+ ldr r3, [lr] ; load up packed filter coefficients
+ ldr r4, [lr, #4]
+ ldr r5, [lr, #8]
+
+ pkhbt r12, r4, r3 ; pack the filter differently
+ pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+ ldr r6, [sp] ; load the data
+ ldr r7, [sp, #4]
+
+ orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+ smuad lr, r3, r6 ; apply filter
+ smulbt r10, r3, r6
+
+ ldr r8, [sp, #8]
+
+ smlad lr, r4, r7, lr
+ smladx r10, r12, r7, r10
+
+ ldrh r9, [sp, #12]
+
+ smlad lr, r5, r8, lr
+ smladx r10, r11, r8, r10
+
+ add sp, sp, #4
+ smlatb r10, r5, r9, r10
+
+ sub r2, r2, #1
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r0], r1 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ strb r10, [r0],r1
+
+ movne r6, r7
+ movne r7, r8
+
+ bne second_pass_wloop_v6
+
+ subs r2, r2, #0x10000
+ add sp, sp, #12 ; update src for next loop (20-8)
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne second_pass_hloop_v6
+
+ add sp, sp, #20
+ ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+; xoffset==0: no horizontal filtering; just copy the 9 source rows into
+; the transposed temp buffer for the vertical pass.
+skip_firstpass_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ mov r2, #9
+ mov r3, #20
+
+skip_firstpass_hloop
+ ldrb r4, [r0], #1 ; load data
+ subs r2, r2, #1
+ ldrb r5, [r0], #1
+ strh r4, [sp], r3 ; store it to immediate buffer
+ ldrb r6, [r0], #1 ; load data
+ strh r5, [sp], r3
+ ldrb r7, [r0], #1
+ strh r6, [sp], r3
+ ldrb r8, [r0], #1
+ strh r7, [sp], r3
+ ldrb r9, [r0], #1
+ strh r8, [sp], r3
+ ldrb r10, [r0], #1
+ strh r9, [sp], r3
+ ldrb r11, [r0], #1
+ strh r10, [sp], r3
+ add r0, r0, r1 ; move to next input line
+ strh r11, [sp], r3
+
+ mov r4, #158
+ sub sp, sp, r4 ; move over to next column
+ bne skip_firstpass_hloop
+
+ b secondpass_filter
+
+;--------------------
+; yoffset==0: no vertical filtering; copy the first-pass result straight
+; out of the temp buffer to dst.
+skip_secondpass_filter
+ mov r2, #8
+ add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+ ldr r6, [sp], #4
+ subs r2, r2, #1
+ ldr r8, [sp], #4
+
+ mov r7, r6, lsr #16 ; unpack
+ strb r6, [r0], r1
+ mov r9, r8, lsr #16
+ strb r7, [r0], r1
+ add sp, sp, #12 ; 20-8
+ strb r8, [r0], r1
+ strb r9, [r0], r1
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne skip_secondpass_hloop
+
+ add sp, sp, #16 ; 180 - (160 +4)
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;-----------------
+ AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_filter8_coeff_
+ DCD filter8_coeff
+filter8_coeff
+ DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
+ DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+ DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+ DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+ DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+ DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+ DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+ DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+ ;DCD 0, 0, 128, 0, 0, 0
+ ;DCD 0, -6, 123, 12, -1, 0
+ ;DCD 2, -11, 108, 36, -8, 1
+ ;DCD 0, -9, 93, 50, -6, 0
+ ;DCD 3, -16, 77, 77, -16, 3
+ ;DCD 0, -6, 50, 93, -9, 0
+ ;DCD 1, -8, 36, 108, -11, 2
+ ;DCD 0, -1, 12, 123, -6, 0
+
+ END
diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
new file mode 100644
index 000000000..bf972a3bc
--- /dev/null
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "subpixel.h"
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+/* Two-tap bilinear filter coefficients for the 8 subpel phases
+ * (xoffset/yoffset in [0,7]). Each pair sums to 128
+ * (VP8_FILTER_WEIGHT), so results are normalized by >> 7. */
+static const short bilinear_filters[8][2] =
+{
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+
+/*
+void vp8_filter_block2d_bil_first_pass_6
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for ( i=0; i<output_height; i++ )
+ {
+ for ( j=0; j<output_width; j++ )
+ {
+ // Apply bilinear filter
+ output_ptr[j] = ( ( (int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[1] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+void vp8_filter_block2d_bil_second_pass_6
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i,j;
+ int Temp;
+
+ for ( i=0; i<output_height; i++ )
+ {
+ for ( j=0; j<output_width; j++ )
+ {
+ // Apply filter
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[output_width] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT/2);
+ output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ // Next row...
+ //src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+*/
+
+/* Separable 2-D bilinear filter (ARMv6 assembly passes).
+ * Filters a Width x Height block: first horizontally into an
+ * intermediate 16-bit buffer (Height+1 rows, since the vertical tap
+ * reads one extra row), then vertically into output_ptr.
+ * HFilter/VFilter are 2-tap pairs from bilinear_filters[]. */
+void vp8_filter_block2d_bil_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+)
+{
+
+ unsigned short FData[36*16]; // Temp data buffer used in filtering; sized for the largest (16x16) case
+
+ // First filter 1-D horizontally...
+ // pixel_step = 1;
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+
+ // then 1-D vertically...
+ vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+/* 4x4 bilinear subpel prediction: select the tap pairs for the
+ * requested x/y phase and run the separable ARMv6 filter. */
+void vp8_bilinear_predict4x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, bilinear_filters[xoffset],
+ bilinear_filters[yoffset], 4, 4);
+}
+
+/* 8x8 bilinear subpel prediction: select the tap pairs for the
+ * requested x/y phase and run the separable ARMv6 filter. */
+void vp8_bilinear_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, bilinear_filters[xoffset],
+ bilinear_filters[yoffset], 8, 8);
+}
+
+/* 8x4 bilinear subpel prediction: select the tap pairs for the
+ * requested x/y phase and run the separable ARMv6 filter. */
+void vp8_bilinear_predict8x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, bilinear_filters[xoffset],
+ bilinear_filters[yoffset], 8, 4);
+}
+
+/* 16x16 bilinear subpel prediction: select the tap pairs for the
+ * requested x/y phase and run the separable ARMv6 filter. */
+void vp8_bilinear_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, bilinear_filters[xoffset],
+ bilinear_filters[yoffset], 16, 16);
+}
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
new file mode 100644
index 000000000..2a4640cae
--- /dev/null
+++ b/vp8/common/arm/filter_arm.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "subpixel.h"
+#include "vpx_ports/mem.h"
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+// Six-tap subpel filter coefficients for the 8 phases; each row sums
+// to 128 (VP8_FILTER_WEIGHT), so results are normalized by >> 7.
+DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
+{
+ { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
+ { 0, -1, 12, 123, -6, 0 },
+};
+
+
+extern void vp8_filter_block2d_first_pass_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp8_filter
+);
+
+#if HAVE_ARMV6
+/* 4x4 six-tap subpel prediction. Dispatches on the offsets:
+ * - xoffset only: single horizontal pass straight to dst
+ * - yoffset only: single vertical pass straight to dst
+ * - both: horizontal pass into FData, then vertical pass to dst.
+ * For the two-pass case, an odd yoffset selects a filter whose two
+ * outer taps are zero (see sub_pel_filters), so only 7 intermediate
+ * rows are needed (starting one row up, written at FData+1) instead
+ * of the full 9 rows starting two rows up. */
+void vp8_sixtap_predict_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); // Temp data buffer used in filtering
+
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ // Vfilter is null. First pass only
+ if (xoffset && !yoffset)
+ {
+ //vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+ //vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );
+
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+ }
+ // Hfilter is null. Second pass only
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ // Vfilter is a 4 tap filter
+ if (yoffset & 0x1)
+ vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+ // Vfilter is 6 tap filter
+ else
+ vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+}
+
+/*
+void vp8_sixtap_predict8x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temp data bufffer used in filtering
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+
+// if (xoffset && !yoffset)
+// {
+// vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
+// }
+ // Hfilter is null. Second pass only
+// else if (!xoffset && yoffset)
+// {
+// vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
+// }
+// else
+// {
+// if (yoffset & 0x1)
+ // vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
+ // else
+
+ vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
+
+ vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
+// }
+}
+*/
+
+/* 8x8 six-tap subpel prediction; same dispatch as
+ * vp8_sixtap_predict_armv6: one-pass shortcuts when only one offset
+ * is non-zero, otherwise a two-pass filter through FData (11 rows for
+ * odd yoffset, whose filter's outer taps are zero; 13 rows otherwise). */
+void vp8_sixtap_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temp data buffer used in filtering
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+ }
+ // Hfilter is null. Second pass only
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ else
+ vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
+}
+
+
+/* 16x16 six-tap subpel prediction; same dispatch as
+ * vp8_sixtap_predict_armv6: one-pass shortcuts when only one offset
+ * is non-zero, otherwise a two-pass filter through FData (19 rows for
+ * odd yoffset, whose filter's outer taps are zero; 21 rows otherwise). */
+void vp8_sixtap_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); // Temp data buffer used in filtering
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+ }
+ // Hfilter is null. Second pass only
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+ else
+ vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
+
+}
+#endif
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
new file mode 100644
index 000000000..f9ed21e0d
--- /dev/null
+++ b/vp8/common/arm/idct_arm.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_ARM_H
+#define IDCT_ARM_H
+
+// Overrides the default inverse-transform function bindings with the
+// ARM-optimised implementations.  Each vp8_idct_* macro is re-#defined to
+// point at the platform routine; the ARMV7 section below deliberately
+// re-#undefs the ARMV6 choices so NEON wins when both are enabled.
+
+#if HAVE_ARMV6
+extern prototype_idct(vp8_short_idct4x4llm_1_v6);
+extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
+extern prototype_idct_scalar(vp8_dc_only_idct_armv6);
+extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
+extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
+
+#undef vp8_idct_idct1
+#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
+
+#undef vp8_idct_idct16
+#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
+
+#undef vp8_idct_idct1_scalar
+#define vp8_idct_idct1_scalar vp8_dc_only_idct_armv6
+
+#undef vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
+
+#undef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_idct(vp8_short_idct4x4llm_1_neon);
+extern prototype_idct(vp8_short_idct4x4llm_neon);
+extern prototype_idct_scalar(vp8_dc_only_idct_neon);
+extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
+extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
+
+#undef vp8_idct_idct1
+#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
+
+#undef vp8_idct_idct16
+#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
+
+#undef vp8_idct_idct1_scalar
+#define vp8_idct_idct1_scalar vp8_dc_only_idct_neon
+
+#undef vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
+
+#undef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
+#endif
+
+#endif
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
new file mode 100644
index 000000000..fa7c62617
--- /dev/null
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+// Signature of the NEON chroma loop-filter helpers: unlike the generic
+// prototype_loopfilter routines, these filter the U and V planes in a
+// single call, with the V-plane pointer passed as the trailing argument.
+typedef void loop_filter_uvfunction
+(
+ unsigned char *u, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ unsigned char *v
+);
+
+// ARMv6 assembly loop-filter primitives (implemented in armv6/*.asm).
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
+
+// NEON assembly loop-filter primitives (implemented in neon/*.asm).
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
+extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
+extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
+
+// NEON combined U+V chroma filters (see loop_filter_uvfunction above).
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
+
+
+#if HAVE_ARMV6
+//ARMV6 loopfilter functions
+// These wrappers adapt the block-level loop-filter interface to the ARMv6
+// edge primitives.  The trailing integer argument (2 for Y, 1 for U/V) is
+// presumably a count of 8-pixel groups per edge - confirm against the asm.
+// The simple ("s") variants filter only the Y plane; the unused chroma
+// parameters are voided.
+
+// Horizontal MB filtering
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+}
+
+// Simple-filter horizontal MB filtering (Y plane only).
+void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+// Vertical MB Filtering
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+}
+
+// Simple-filter vertical MB filtering (Y plane only).
+void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+// Horizontal B Filtering
+// Filters the three interior horizontal block edges of the Y plane
+// (rows 4, 8, 12) and the single interior edge of each chroma plane.
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+}
+
+// Simple-filter horizontal B filtering (Y plane only).
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+
+// Vertical B Filtering
+// Filters the three interior vertical block edges of the Y plane
+// (columns 4, 8, 12) and the single interior edge of each chroma plane.
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+}
+
+// Simple-filter vertical B filtering (Y plane only).
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+#endif
+
+#if HAVE_ARMV7
+// NEON loopfilter functions
+// Same block-level interface as the ARMv6 wrappers above, but the chroma
+// planes are handled by combined U+V routines (loop_filter_uvfunction),
+// so only a single guarded call is made for both chroma planes.
+
+// Horizontal MB filtering
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+ // NOTE(review): v_ptr is passed whenever u_ptr is non-NULL; this assumes
+ // u_ptr and v_ptr are always both valid or both NULL - confirm callers.
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+}
+
+// Simple-filter horizontal MB filtering (Y plane only).
+void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+// Vertical MB Filtering
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+ // NOTE(review): see mbh_neon - v_ptr is assumed valid when u_ptr is.
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr)
;
+}
+
+// Simple-filter vertical MB filtering (Y plane only).
+void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+// Horizontal B Filtering
+// Interior horizontal edges: Y rows 4, 8, 12 plus one chroma edge.
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+
+ // NOTE(review): v_ptr arithmetic/use is guarded only by u_ptr; if v_ptr
+ // could be NULL while u_ptr is not, this is undefined - confirm callers.
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
+}
+
+// Simple-filter horizontal B filtering (Y plane only).
+void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+
+// Vertical B Filtering
+// Interior vertical edges: Y columns 4, 8, 12 plus one chroma edge.
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+
+ // NOTE(review): see bh_neon - v_ptr is assumed valid when u_ptr is.
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
+}
+
+// Simple-filter vertical B filtering (Y plane only).
+void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+#endif
diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h
new file mode 100644
index 000000000..4bb49456d
--- /dev/null
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef LOOPFILTER_ARM_H
+#define LOOPFILTER_ARM_H
+
+// Rebinds the block-level loop-filter function macros to the ARM-optimised
+// wrappers defined in loopfilter_arm.c.  The ARMV7 section re-#undefs the
+// ARMV6 bindings so the NEON versions take precedence when both are built.
+
+#if HAVE_ARMV6
+extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
+extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+
+#undef vp8_lf_normal_mb_v
+#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6
+
+#undef vp8_lf_normal_b_v
+#define vp8_lf_normal_b_v vp8_loop_filter_bv_armv6
+
+#undef vp8_lf_normal_mb_h
+#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_armv6
+
+#undef vp8_lf_normal_b_h
+#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
+
+#undef vp8_lf_simple_mb_v
+#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+
+#undef vp8_lf_simple_b_v
+#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
+
+#undef vp8_lf_simple_mb_h
+#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+
+#undef vp8_lf_simple_b_h
+#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
+extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
+
+#undef vp8_lf_normal_mb_v
+#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon
+
+#undef vp8_lf_normal_b_v
+#define vp8_lf_normal_b_v vp8_loop_filter_bv_neon
+
+#undef vp8_lf_normal_mb_h
+#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_neon
+
+#undef vp8_lf_normal_b_h
+#define vp8_lf_normal_b_h vp8_loop_filter_bh_neon
+
+#undef vp8_lf_simple_mb_v
+#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_neon
+
+#undef vp8_lf_simple_b_v
+#define vp8_lf_simple_b_v vp8_loop_filter_bvs_neon
+
+#undef vp8_lf_simple_mb_h
+#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_neon
+
+#undef vp8_lf_simple_b_h
+#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
+#endif
+
+#endif
diff --git a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
new file mode 100644
index 000000000..a2fea2bd6
--- /dev/null
+++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -0,0 +1,361 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+; Bilinear 16x16 prediction.  First pass (horizontal, 2-tap) filters 17
+; rows into a 272-byte stack buffer; second pass (vertical) reduces them
+; to the final 16 rows in dst.  A zero xoffset or yoffset branches to the
+; corresponding single-pass-only path below.
+ push {r4-r5, lr}
+
+ ldr r12, _bifilter16_coeff_
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for rest 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272 ;rewind lr to start of intermediate buffer
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ vst1.u8 {d4, d5}, [r4], r5
+ vst1.u8 {d6, d7}, [r4], r5
+ vmov q11, q15 ;reuse last loaded row as next iteration's first row
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_sp16x16_loop_neon
+
+ add sp, sp, #272
+
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r4], r5 ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r4], r5
+ vst1.u8 {d18, d19}, [r4], r5
+ vst1.u8 {d20, d21}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r4], r5
+ vmov q11, q15 ;reuse last loaded row as next iteration's first row
+ vst1.u8 {d6, d7}, [r4], r5
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_spo16x16_loop_neon
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+ AREA bifilters16_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_bifilter16_coeff_
+ DCD bifilter16_coeff
+bifilter16_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
new file mode 100644
index 000000000..74d2db5dc
--- /dev/null
+++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -0,0 +1,134 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict4x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+; Bilinear 4x4 prediction: 2-tap horizontal pass over 5 rows held in
+; registers (d28-d30), then 2-tap vertical pass to dst.  Either pass is
+; skipped when its offset is zero.
+ push {r4, lr}
+
+ ldr r12, _bifilter4_coeff_
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+ vld1.u8 {d2}, [r0], r1 ;load src data
+ add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)
+
+ vld1.u8 {d3}, [r0], r1
+ vld1.u32 {d31}, [r2] ;first_pass filter
+
+ vld1.u8 {d4}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
+ vld1.u8 {d5}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {d6}, [r0], r1
+
+ vshr.u64 q4, q1, #8 ;construct src_ptr[1]
+ vshr.u64 q5, q2, #8
+ vshr.u64 d12, d6, #8
+
+ vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d4, d5
+ vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+
+ vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q8, d10, d1
+ vmlal.u8 q9, d12, d1
+
+ vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d29, q8, #7
+ vqrshrn.u16 d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3 ;calculate Vfilter location
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d28, d0
+ vmull.u8 q2, d29, d0
+
+ vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
+ vext.8 d27, d29, d30, #4
+
+ vmlal.u8 q1, d26, d1
+ vmlal.u8 q2, d27, d1
+
+ add r0, r4, lr ;precompute the four dst row addresses
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+
+ vst1.32 {d2[0]}, [r4] ;store result
+ vst1.32 {d2[1]}, [r0]
+ vst1.32 {d3[0]}, [r1]
+ vst1.32 {d3[1]}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+; xoffset=0: load the 5 unfiltered source rows straight into d28-d30.
+
+ vld1.32 {d28[0]}, [r0], r1 ;load src data
+ vld1.32 {d28[1]}, [r0], r1
+ vld1.32 {d29[0]}, [r0], r1
+ vld1.32 {d29[1]}, [r0], r1
+ vld1.32 {d30[0]}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+; yoffset=0: write the first-pass (or raw) rows directly to dst.
+ vst1.32 {d28[0]}, [r4], lr ;store result
+ vst1.32 {d28[1]}, [r4], lr
+ vst1.32 {d29[0]}, [r4], lr
+ vst1.32 {d29[1]}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+ AREA bilinearfilters4_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_bifilter4_coeff_
+ DCD bifilter4_coeff
+bifilter4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
new file mode 100644
index 000000000..46ebb0e0b
--- /dev/null
+++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -0,0 +1,139 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+; Bilinear 8x4 prediction: 2-tap horizontal pass over 5 rows held in
+; registers (d22-d26), then 2-tap vertical pass producing 4 rows of 8
+; pixels in dst.  Either pass is skipped when its offset is zero.
+ push {r4, lr}
+
+ ldr r12, _bifilter8x4_coeff_
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {q5}, [r0], r1
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q7, #7
+ vqrshrn.u16 d24, q8, #7
+ vqrshrn.u16 d25, q9, #7
+ vqrshrn.u16 d26, q10, #7
+
+;Second pass: 4 rows x 8 columns
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr ;precompute the four dst row addresses
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1]
+ vst1.u8 {d5}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+; xoffset=0: load the 5 unfiltered source rows straight into d22-d26.
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+; yoffset=0: write the first-pass (or raw) rows directly to dst.
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+ AREA bifilters8x4_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_bifilter8x4_coeff_
+ DCD bifilter8x4_coeff
+bifilter8x4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
new file mode 100644
index 000000000..80728d4f8
--- /dev/null
+++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -0,0 +1,187 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+ push {r4, lr}
+
+ ldr r12, _bifilter8_coeff_
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+ ;first_pass filtering on the rest 5-line data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1], lr
+ vst1.u8 {d5}, [r1], lr
+ vst1.u8 {d6}, [r1], lr
+ vst1.u8 {d7}, [r1], lr
+ vst1.u8 {d8}, [r1], lr
+ vst1.u8 {d9}, [r1], lr
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+ vst1.u8 {d26}, [r4], lr
+ vst1.u8 {d27}, [r4], lr
+ vst1.u8 {d28}, [r4], lr
+ vst1.u8 {d29}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+ AREA bifilters8_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_bifilter8_coeff_
+ DCD bifilter8_coeff
+bifilter8_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
new file mode 100644
index 000000000..f42ac63c9
--- /dev/null
+++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -0,0 +1,583 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_build_intra_predictors_mby_neon_func|
+ EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+ push {r4-r8, lr}
+
+ cmp r3, #0
+ beq case_dc_pred
+ cmp r3, #1
+ beq case_v_pred
+ cmp r3, #2
+ beq case_h_pred
+ cmp r3, #3
+ beq case_tm_pred
+
+case_dc_pred
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+ ; Move back to interger registers
+
+skip_dc_pred_up
+
+ cmp r5, #0
+ beq skip_dc_pred_left
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+
+ pop {r4-r8,pc}
+case_v_pred
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ pop {r4-r8,pc}
+
+case_h_pred
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ pop {r4-r8,pc}
+
+case_tm_pred
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+ push {r4-r8, lr}
+
+ mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+ cmp r3, #0
+ beq case_dc_pred_s
+ cmp r3, #1
+ beq case_v_pred_s
+ cmp r3, #2
+ beq case_h_pred_s
+ cmp r3, #3
+ beq case_tm_pred_s
+
+case_dc_pred_s
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left_s
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up_s
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+ ; Move back to interger registers
+
+skip_dc_pred_up_s
+
+ cmp r5, #0
+ beq skip_dc_pred_left_s
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left_s
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left_s
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+
+ pop {r4-r8,pc}
+case_v_pred_s
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ pop {r4-r8,pc}
+
+case_h_pred_s
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ pop {r4-r8,pc}
+
+case_tm_pred_s
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop_s
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop_s
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+
+ END
diff --git a/vp8/common/arm/neon/copymem16x16_neon.asm b/vp8/common/arm/neon/copymem16x16_neon.asm
new file mode 100644
index 000000000..89d5e1018
--- /dev/null
+++ b/vp8/common/arm/neon/copymem16x16_neon.asm
@@ -0,0 +1,58 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_neon| PROC
+
+ vld1.u8 {q0}, [r0], r1
+ vld1.u8 {q1}, [r0], r1
+ vld1.u8 {q2}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ vld1.u8 {q3}, [r0], r1
+ vst1.u8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {q2}, [r2], r3
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {q4}, [r2], r3
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {q5}, [r2], r3
+ vld1.u8 {q8}, [r0], r1
+ vst1.u8 {q6}, [r2], r3
+ vld1.u8 {q9}, [r0], r1
+ vst1.u8 {q7}, [r2], r3
+ vld1.u8 {q10}, [r0], r1
+ vst1.u8 {q8}, [r2], r3
+ vld1.u8 {q11}, [r0], r1
+ vst1.u8 {q9}, [r2], r3
+ vld1.u8 {q12}, [r0], r1
+ vst1.u8 {q10}, [r2], r3
+ vld1.u8 {q13}, [r0], r1
+ vst1.u8 {q11}, [r2], r3
+ vld1.u8 {q14}, [r0], r1
+ vst1.u8 {q12}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+ vst1.u8 {q13}, [r2], r3
+ vst1.u8 {q14}, [r2], r3
+ vst1.u8 {q15}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_neon|
+
+ END
diff --git a/vp8/common/arm/neon/copymem8x4_neon.asm b/vp8/common/arm/neon/copymem8x4_neon.asm
new file mode 100644
index 000000000..302f734ff
--- /dev/null
+++ b/vp8/common/arm/neon/copymem8x4_neon.asm
@@ -0,0 +1,33 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_neon| PROC
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vst1.u8 {d3}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_neon|
+
+ END
diff --git a/vp8/common/arm/neon/copymem8x8_neon.asm b/vp8/common/arm/neon/copymem8x8_neon.asm
new file mode 100644
index 000000000..50d39ef66
--- /dev/null
+++ b/vp8/common/arm/neon/copymem8x8_neon.asm
@@ -0,0 +1,42 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_neon| PROC
+
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vld1.u8 {d4}, [r0], r1
+ vst1.u8 {d3}, [r2], r3
+ vld1.u8 {d5}, [r0], r1
+ vst1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r0], r1
+ vst1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r0], r1
+ vst1.u8 {d6}, [r2], r3
+ vst1.u8 {d7}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_neon|
+
+ END
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
new file mode 100644
index 000000000..4fc744c96
--- /dev/null
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -0,0 +1,95 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+ EXPORT |vp8_short_inv_walsh4x4_neon|
+ EXPORT |vp8_short_inv_walsh4x4_1_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_neon| PROC
+
+ ; read in all four lines of values: d0->d3
+ vldm.64 r0, {q0, q1}
+
+ ; first for loop
+
+ vadd.s16 d4, d0, d3 ;a = [0] + [12]
+ vadd.s16 d5, d1, d2 ;b = [4] + [8]
+ vsub.s16 d6, d1, d2 ;c = [4] - [8]
+ vsub.s16 d7, d0, d3 ;d = [0] - [12]
+
+ vadd.s16 d0, d4, d5 ;a + b
+ vadd.s16 d1, d6, d7 ;c + d
+ vsub.s16 d2, d4, d5 ;a - b
+ vsub.s16 d3, d7, d6 ;d - c
+
+ vtrn.32 d0, d2 ;d0: 0 1 8 9
+ ;d2: 2 3 10 11
+ vtrn.32 d1, d3 ;d1: 4 5 12 13
+ ;d3: 6 7 14 15
+
+ vtrn.16 d0, d1 ;d0: 0 4 8 12
+ ;d1: 1 5 9 13
+ vtrn.16 d2, d3 ;d2: 2 6 10 14
+ ;d3: 3 7 11 15
+
+ ; second for loop
+
+ vadd.s16 d4, d0, d3 ;a = [0] + [3]
+ vadd.s16 d5, d1, d2 ;b = [1] + [2]
+ vsub.s16 d6, d1, d2 ;c = [1] - [2]
+ vsub.s16 d7, d0, d3 ;d = [0] - [3]
+
+ vadd.s16 d0, d4, d5 ;e = a + b
+ vadd.s16 d1, d6, d7 ;f = c + d
+ vsub.s16 d2, d4, d5 ;g = a - b
+ vsub.s16 d3, d7, d6 ;h = d - c
+
+ vmov.i16 q2, #3
+ vadd.i16 q0, q0, q2 ;e/f += 3
+ vadd.i16 q1, q1, q2 ;g/h += 3
+
+ vshr.s16 q0, q0, #3 ;e/f >> 3
+ vshr.s16 q1, q1, #3 ;g/h >> 3
+
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vstmia.16 r1!, {q0}
+ vstmia.16 r1!, {q1}
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
+
+
+;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_neon| PROC
+ ; load a full line into a neon register
+ vld1.16 {q0}, [r0]
+ ; extract first element and replicate
+ vdup.16 q1, d0[0]
+ ; add 3 to all values
+ vmov.i16 q2, #3
+ vadd.i16 q3, q1, q2
+ ; right shift
+ vshr.s16 q3, q3, #3
+ ; write it back
+ vstmia.16 r1!, {q3}
+ vstmia.16 r1!, {q3}
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_1_neon|
+
+ END
diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm
new file mode 100644
index 000000000..e3e8e8a72
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm
@@ -0,0 +1,205 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *u,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; stack(r5) unsigned char *v
+
+|vp8_loop_filter_horizontal_edge_uv_neon| PROC
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+
+ ldr r2, [sp, #4] ; load v ptr
+ ldr r12, [sp, #0] ; load thresh pointer
+
+ sub r2, r2, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0], r1 ; p3
+ vld1.u8 {d7}, [r2], r1 ; p3
+ vld1.u8 {d8}, [r0], r1 ; p2
+ vld1.u8 {d9}, [r2], r1 ; p2
+ vld1.u8 {d10}, [r0], r1 ; p1
+ vld1.u8 {d11}, [r2], r1 ; p1
+ vld1.u8 {d12}, [r0], r1 ; p0
+ vld1.u8 {d13}, [r2], r1 ; p0
+ vld1.u8 {d14}, [r0], r1 ; q0
+ vld1.u8 {d15}, [r2], r1 ; q0
+ vld1.u8 {d16}, [r0], r1 ; q1
+ vld1.u8 {d17}, [r2], r1 ; q1
+ vld1.u8 {d18}, [r0], r1 ; q2
+ vld1.u8 {d19}, [r2], r1 ; q2
+ vld1.u8 {d20}, [r0], r1 ; q3
+ vld1.u8 {d21}, [r2], r1 ; q3
+
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+
+ ldr r12, _lfhuv_coeff_
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
+ vadd.u8 q0, q0, q0 ; flimit * 2
+ vadd.u8 q0, q0, q1 ; flimit * 2 + limit
+
+ vand q15, q15, q12
+ vand q10, q10, q11
+ vand q3, q3, q4
+
+ vabd.u8 q2, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
+ vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+
+ vld1.u8 {q0}, [r12]!
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+;;;;;;;;;;;;;;
+ vld1.u8 {q10}, [r12]!
+
+ ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vand q3, q3, q9
+ vmovl.u8 q4, d20
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+ vorr q14, q13, q14 ; q14: vp8_hevmask
+
+ ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; vp8_filter &= hev
+ vand q15, q15, q3 ; q15: vp8_filter_mask
+ ;;
+ ;vld1.u8 {q4}, [r12]! ;no need 7 any more
+
+ ;vqadd.s8 q1, q1, q2
+ vaddw.s8 q2, q2, d2
+ vaddw.s8 q11, q11, d3
+
+ vld1.u8 {q9}, [r12]!
+ ;
+ vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d3, q11
+ ;;
+
+ vand q1, q1, q15 ; vp8_filter &= mask
+ ;;
+;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+; vand q2, q1, q4 ; s = vp8_filter & 7
+; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+ ;;;;
+; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3
+; vceq.i8 q2, q2, q9 ; s = (s==4)*-1
+ ;;
+; ;calculate output
+; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; q10=3
+ vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
+ ;calculate output
+ vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1
+
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ ;
+
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+
+ sub r2, r2, r1, lsl #2
+ sub r2, r2, r1, lsl #1
+ ;;
+
+ vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u)
+ vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+ ;
+
+ veor q5, q13, q0 ; *op1 = u^0x80
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q8, q12, q0 ; *oq1 = u^0x80
+ ;
+
+ vst1.u8 {d10}, [r0], r1 ; store u op1
+ vst1.u8 {d11}, [r2], r1 ; store v op1
+ vst1.u8 {d12}, [r0], r1 ; store u op0
+ vst1.u8 {d13}, [r2], r1 ; store v op0
+ vst1.u8 {d14}, [r0], r1 ; store u oq0
+ vst1.u8 {d15}, [r2], r1 ; store v oq0
+ vst1.u8 {d16}, [r0], r1 ; store u oq1
+ vst1.u8 {d17}, [r2], r1 ; store v oq1
+
+ bx lr
+ ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
+
+;-----------------
+ AREA hloopfilteruv_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_lfhuv_coeff_
+ DCD lfhuv_coeff
+lfhuv_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
+
+ END
diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm
new file mode 100644
index 000000000..f11055d42
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm
@@ -0,0 +1,188 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+
+|vp8_loop_filter_horizontal_edge_y_neon| PROC
+    sub         r0, r0, r1, lsl #2          ; move src pointer down by 4 lines
+    ldr         r12, [sp, #0]               ; load thresh pointer
+
+    vld1.u8     {q3}, [r0], r1              ; p3
+    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
+    vld1.u8     {q4}, [r0], r1              ; p2
+    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    vld1.u8     {q5}, [r0], r1              ; p1
+    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {q6}, [r0], r1              ; p0
+    ldr         r12, _lfhy_coeff_           ; address of constant table (0x80, 0x03, 0x04)
+    vld1.u8     {q7}, [r0], r1              ; q0
+
+    ;vp8_filter_mask() function
+    ;vp8_hevmask() function
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vld1.u8     {q8}, [r0], r1              ; q1
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vld1.u8     {q9}, [r0], r1              ; q2
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vld1.u8     {q10}, [r0], r1             ; q3
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    vcge.u8     q15, q1, q11                ; (abs(p3 - p2) <= limit)*-1
+    vcge.u8     q12, q1, q12                ; (abs(p2 - p1) <= limit)*-1
+    vcge.u8     q10, q1, q13                ; (abs(p1 - p0) <= limit)*-1
+    vcge.u8     q11, q1, q14                ; (abs(q1 - q0) <= limit)*-1
+
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+
+    vcge.u8     q3, q1, q3                  ; (abs(q2 - q1) <= limit)*-1
+    vcge.u8     q4, q1, q4                  ; (abs(q3 - q2) <= limit)*-1
+    vadd.u8     q0, q0, q0                  ; flimit * 2
+    vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
+
+    vand        q15, q15, q12               ; combine the per-pair "within limit" masks
+    vand        q10, q10, q11
+    vand        q3, q3, q4
+
+    vabd.u8     q2, q5, q8                  ; abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; abs(p0 - q0) * 2
+    vshr.u8     q2, q2, #1                  ; abs(p1 - q1) / 2
+    vqadd.u8    q9, q9, q2                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+    vcge.u8     q9, q0, q9                  ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+    vld1.u8     {q0}, [r12]!                ; 0x80 (sign-bias constant)
+
+    vand        q15, q15, q10
+
+    ;vp8_filter() function
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+;;;;;;;;;;;;;;
+    vld1.u8     {q10}, [r12]!               ; 0x03
+
+    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vand        q3, q3, q9
+    vmovl.u8    q4, d20                     ; widen 0x03 to 16-bit for the multiply
+
+    vqsub.s8    q1, q5, q8                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; q14: vp8_hevmask
+
+    ;vmul.i8    q2, q2, q10                 ; 3 * ( qs0 - ps0)
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp8_filter &= hev
+    vand        q15, q15, q3                ; q15: vp8_filter_mask
+    ;;
+    ;vld1.u8    {q4}, [r12]!                ;no need 7 any more
+
+    ;vqadd.s8   q1, q1, q2
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vld1.u8     {q9}, [r12]!                ; 0x04
+    ;
+    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d3, q11
+    ;;
+
+    vand        q1, q1, q15                 ; vp8_filter &= mask
+    ;;
+;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+;   vand        q2, q1, q4                  ; s = vp8_filter & 7
+;   vqadd.s8    q1, q1, q9                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+    ;;;;
+;   vshr.s8     q1, q1, #3                  ; vp8_filter >>= 3
+;   vceq.i8     q2, q2, q9                  ; s = (s==4)*-1
+    ;;
+;   ;calculate output
+;   vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+;   vqadd.s8    q11, q2, q1                 ; u = vp8_signed_char_clamp(s + vp8_filter)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; q10=3
+    vqadd.s8    q2, q1, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vrshr.s8    q1, q1, #1                  ;round/shift: vp8_filter += 1; vp8_filter >>= 1
+
+    sub         r0, r0, r1, lsl #2          ; 8 rows were read; back up 4+2=6 rows
+    sub         r0, r0, r1, lsl #1          ; so r0 now points at the p1 row
+    ;
+
+    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
+    ;
+    add         r2, r1, r0                  ; r2 = p0 row
+
+    vqadd.s8    q13, q5, q1                 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+    ;vqadd.s8   q11, q6, q11                ; u = vp8_signed_char_clamp(ps0 + u)
+    vqsub.s8    q12, q8, q1                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+    add         r3, r2, r1                  ; r3 = q0 row
+
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+
+    add         r12, r3, r1                 ; r12 = q1 row
+
+    vst1.u8     {q5}, [r0]                  ; store op1
+    vst1.u8     {q6}, [r2]                  ; store op0
+    vst1.u8     {q7}, [r3]                  ; store oq0
+    vst1.u8     {q8}, [r12]                 ; store oq1
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+;-----------------
+ AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_lfhy_coeff_
+ DCD lfhy_coeff
+lfhy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
+
+ END
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
new file mode 100644
index 000000000..6d74fab52
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+
+|vp8_loop_filter_simple_horizontal_edge_neon| PROC
+    sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines
+
+    ldr         r12, _lfhy_coeff_           ; address of constant table (0x80, 0x03, 0x04)
+    vld1.u8     {q5}, [r0], r1              ; p1
+    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
+    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
+    vld1.u8     {q6}, [r0], r1              ; p0
+    vld1.u8     {q0}, [r12]!                ; 0x80
+    vld1.u8     {q7}, [r0], r1              ; q0
+    vld1.u8     {q10}, [r12]!               ; 0x03
+    vld1.u8     {q8}, [r0]                  ; q1
+
+    ;vp8_filter_mask() function
+    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
+    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    ;vp8_filter() function
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vadd.u8     q1, q1, q1                  ; flimit * 2
+    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+;;;;;;;;;;
+    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q3, d15, d13
+
+    vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+    ;vmul.i8    q2, q2, q10                 ; 3 * ( qs0 - ps0)
+    vadd.s16    q11, q2, q2                 ; 3 * ( qs0 - ps0) built as x + 2x
+    vadd.s16    q12, q3, q3
+
+    vld1.u8     {q9}, [r12]!                ; 0x04
+
+    vadd.s16    q2, q2, q11
+    vadd.s16    q3, q3, q12
+
+    vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q3, q3, d9
+
+    ;vqadd.s8   q4, q4, q2                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d9, q3
+;;;;;;;;;;;;;
+
+    vand        q4, q4, q15                 ; vp8_filter &= mask
+
+    vqadd.s8    q2, q4, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q4, q4, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q4, q4, #3                  ; Filter1 >>= 3
+
+    sub         r0, r0, r1, lsl #1          ; rewind src so r0 points at the p0 row
+
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+
+    add         r3, r0, r1                  ; r3 = q0 row
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+
+    vst1.u8     {q6}, [r0]                  ; store op0
+    vst1.u8     {q7}, [r3]                  ; store oq0
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
+
+;-----------------
+ AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_lfhy_coeff_
+ DCD lfhy_coeff
+lfhy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+
+ END
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
new file mode 100644
index 000000000..2bb6222b9
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -0,0 +1,158 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+
+|vp8_loop_filter_simple_vertical_edge_neon| PROC
+    sub         r0, r0, #2                  ; move src pointer down by 2 columns
+
+    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
+    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
+    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
+    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
+    ldr         r12, _vlfy_coeff_           ; address of constant table (0x80, 0x03, 0x04)
+    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
+    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
+    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
+    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
+    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
+    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
+
+    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vld1.u8     {q0}, [r12]!                ; 0x80
+    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vld1.u8     {q11}, [r12]!               ; 0x03
+    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vld1.u8     {q12}, [r12]!               ; 0x04
+    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+
+    vswp        d7, d10                     ; gather p1/p0/q0/q1 columns into whole q regs
+    vswp        d12, d9
+    ;vswp       q4, q5                      ; p1:q3, p0:q5, q0:q4, q1:q6
+
+    ;vp8_filter_mask() function
+    ;vp8_hevmask() function
+    sub         r0, r0, r1, lsl #4          ; rewind src pointer by the 16 rows just read
+    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
+    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vadd.u8     q1, q1, q1                  ; flimit * 2
+    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+    ;vp8_filter() function
+;;;;;;;;;;
+    ;vqsub.s8   q2, q5, q4                  ; ( qs0 - ps0)
+    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
+    vsubl.s8    q13, d9, d11
+
+    vqsub.s8    q1, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+    ;vmul.i8    q2, q2, q11                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vadd.s16    q10, q2, q2                 ; 3 * ( qs0 - ps0) built as x + 2x
+    vadd.s16    q14, q13, q13
+    vadd.s16    q2, q2, q10
+    vadd.s16    q13, q13, q14
+
+    ;vqadd.s8   q1, q1, q2
+    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d3
+
+    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d3, q13
+
+    add         r0, r0, #1                  ; advance to the p0 column for the stores
+    add         r2, r0, r1                  ; r2 = second row
+;;;;;;;;;;;
+
+    vand        q1, q1, q15                 ; vp8_filter &= mask
+
+    vqadd.s8    q2, q1, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q1, q1, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+
+    ;calculate output
+    vqsub.s8    q10, q4, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+    vqadd.s8    q11, q5, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+
+    add         r3, r2, r1
+    vswp        d13, d14                    ; interleave op0/oq0 halves for paired stores
+    add         r12, r3, r1
+
+    ;store op0, oq0 as adjacent byte pairs, one row per store
+    vst2.8      {d12[0], d13[0]}, [r0]
+    vst2.8      {d12[1], d13[1]}, [r2]
+    vst2.8      {d12[2], d13[2]}, [r3]
+    vst2.8      {d12[3], d13[3]}, [r12], r1
+    add         r0, r12, r1
+    vst2.8      {d12[4], d13[4]}, [r12]
+    vst2.8      {d12[5], d13[5]}, [r0], r1
+    add         r2, r0, r1
+    vst2.8      {d12[6], d13[6]}, [r0]
+    vst2.8      {d12[7], d13[7]}, [r2], r1
+    add         r3, r2, r1
+    vst2.8      {d14[0], d15[0]}, [r2]
+    vst2.8      {d14[1], d15[1]}, [r3], r1
+    add         r12, r3, r1
+    vst2.8      {d14[2], d15[2]}, [r3]
+    vst2.8      {d14[3], d15[3]}, [r12], r1
+    add         r0, r12, r1
+    vst2.8      {d14[4], d15[4]}, [r12]
+    vst2.8      {d14[5], d15[5]}, [r0], r1
+    add         r2, r0, r1
+    vst2.8      {d14[6], d15[6]}, [r0]
+    vst2.8      {d14[7], d15[7]}, [r2]
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
+
+;-----------------
+ AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_vlfy_coeff_
+ DCD vlfy_coeff
+vlfy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+
+ END
diff --git a/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm
new file mode 100644
index 000000000..d79cc68a3
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm
@@ -0,0 +1,231 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *u,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; stack(r5) unsigned char *v
+
+|vp8_loop_filter_vertical_edge_uv_neon| PROC
+    sub         r0, r0, #4                  ; move u pointer down by 4 columns
+    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
+
+    ldr         r2, [sp, #4]                ; load v ptr
+    ldr         r12, [sp, #0]               ; load thresh pointer
+
+    sub         r2, r2, #4                  ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r0], r1              ;load u data
+    vld1.u8     {d7}, [r2], r1              ;load v data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r2], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r2], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r2], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r2], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r2], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r2], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+
+    ldr         r12, _vlfuv_coeff_          ; address of constant table (0x80, 0x03, 0x04, 0x01)
+    ;vp8_filter_mask() function
+    ;vp8_hevmask() function
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    vcge.u8     q15, q1, q11                ; (abs(p3 - p2) <= limit)*-1
+    vcge.u8     q12, q1, q12                ; (abs(p2 - p1) <= limit)*-1
+    vcge.u8     q10, q1, q13                ; (abs(p1 - p0) <= limit)*-1
+    vcge.u8     q11, q1, q14                ; (abs(q1 - q0) <= limit)*-1
+
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+
+    vcge.u8     q3, q1, q3                  ; (abs(q2 - q1) <= limit)*-1
+    vcge.u8     q4, q1, q4                  ; (abs(q3 - q2) <= limit)*-1
+    vadd.u8     q0, q0, q0                  ; flimit * 2
+    vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
+
+    vand        q15, q15, q12               ; combine the per-pair "within limit" masks
+    vand        q10, q10, q11
+    vand        q3, q3, q4
+
+    vabd.u8     q2, q5, q8                  ; abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; abs(p0 - q0) * 2
+    vshr.u8     q2, q2, #1                  ; abs(p1 - q1) / 2
+    vqadd.u8    q9, q9, q2                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+    vcge.u8     q9, q0, q9                  ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+    vld1.u8     {q0}, [r12]!                ; 0x80 (sign-bias constant)
+
+    vand        q15, q15, q10
+
+
+    ;vp8_filter() function
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+;;;;;;;;;;;;;;
+    vld1.u8     {q10}, [r12]!               ; 0x03
+
+    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vand        q3, q3, q9
+    vmovl.u8    q4, d20                     ; widen 0x03 to 16-bit for the multiply
+
+    vqsub.s8    q1, q5, q8                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; q14: vp8_hevmask
+
+    ;vmul.i8    q2, q2, q10                 ; 3 * ( qs0 - ps0)
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp8_filter &= hev
+    vand        q15, q15, q3                ; q15: vp8_filter_mask
+    ;;
+    ;vld1.u8    {q4}, [r12]!                ;no need 7 any more
+
+    ;vqadd.s8   q1, q1, q2
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vld1.u8     {q9}, [r12]!                ; 0x04
+    ;
+    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d3, q11
+    ;;
+
+    vand        q1, q1, q15                 ; vp8_filter &= mask
+    ;;
+;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+;   vand        q2, q1, q4                  ; s = vp8_filter & 7
+;   vqadd.s8    q1, q1, q9                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+    ;;;;
+;   vshr.s8     q1, q1, #3                  ; vp8_filter >>= 3
+;   vceq.i8     q2, q2, q9                  ; s = (s==4)*-1
+    ;;
+;   ;calculate output
+;   vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+;   vqadd.s8    q11, q2, q1                 ; u = vp8_signed_char_clamp(s + vp8_filter)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; q10=3
+    vqadd.s8    q2, q1, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vrshr.s8    q1, q1, #1                  ;round/shift: vp8_filter += 1; vp8_filter >>= 1
+
+    sub         r0, r0, r1, lsl #3          ; rewind u pointer by the 8 rows read
+    add         r0, r0, #2                  ; column of op1
+
+    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
+
+    sub         r2, r2, r1, lsl #3          ; rewind v pointer by the 8 rows read
+    add         r2, r2, #2                  ; column of op1
+
+    vqadd.s8    q13, q5, q1                 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+    ;vqadd.s8   q11, q6, q11                ; u = vp8_signed_char_clamp(ps0 + u)
+    vqsub.s8    q12, q8, q1                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+
+    vswp        d12, d11                    ; interleave op1/op0/oq0/oq1 for 4-byte stores
+    vswp        d16, d13
+    vswp        d14, d12
+    vswp        d16, d15
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2], r1
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
+
+;-----------------
+ AREA vloopfilteruv_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_vlfuv_coeff_
+ DCD vlfuv_coeff
+vlfuv_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
+
+ END
diff --git a/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm
new file mode 100644
index 000000000..3a230a953
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm
@@ -0,0 +1,235 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_vertical_edge_y_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+
+|vp8_loop_filter_vertical_edge_y_neon| PROC
+    sub         r0, r0, #4                  ; move src pointer down by 4 columns
+    ldr         r12, [sp, #0]               ; load thresh pointer
+
+    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
+    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
+    vld1.u8     {d8}, [r0], r1
+    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    vld1.u8     {d10}, [r0], r1
+    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {d12}, [r0], r1
+    ldr         r12, _vlfy_coeff_           ; address of constant table (0x80, 0x03, 0x04, 0x01)
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d20}, [r0], r1
+
+    vld1.u8     {d7}, [r0], r1              ; load second 8-line src data
+    vld1.u8     {d9}, [r0], r1
+    vld1.u8     {d11}, [r0], r1
+    vld1.u8     {d13}, [r0], r1
+    vld1.u8     {d15}, [r0], r1
+    vld1.u8     {d17}, [r0], r1
+    vld1.u8     {d19}, [r0], r1
+    vld1.u8     {d21}, [r0], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;vp8_filter_mask() function
+    ;vp8_hevmask() function
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    vcge.u8     q15, q1, q11                ; (abs(p3 - p2) <= limit)*-1
+    vcge.u8     q12, q1, q12                ; (abs(p2 - p1) <= limit)*-1
+    vcge.u8     q10, q1, q13                ; (abs(p1 - p0) <= limit)*-1
+    vcge.u8     q11, q1, q14                ; (abs(q1 - q0) <= limit)*-1
+
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+
+    vcge.u8     q3, q1, q3                  ; (abs(q2 - q1) <= limit)*-1
+    vcge.u8     q4, q1, q4                  ; (abs(q3 - q2) <= limit)*-1
+    vadd.u8     q0, q0, q0                  ; flimit * 2
+    vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
+
+    vand        q15, q15, q12               ; combine the per-pair "within limit" masks
+    vand        q10, q10, q11
+    vand        q3, q3, q4
+
+    vabd.u8     q2, q5, q8                  ; abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; abs(p0 - q0) * 2
+    vshr.u8     q2, q2, #1                  ; abs(p1 - q1) / 2
+    vqadd.u8    q9, q9, q2                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+    vcge.u8     q9, q0, q9                  ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+    vld1.u8     {q0}, [r12]!                ; 0x80 (sign-bias constant)
+
+    vand        q15, q15, q10
+
+
+    ;vp8_filter() function
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+;;;;;;;;;;;;;;
+    vld1.u8     {q10}, [r12]!               ; 0x03
+
+    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vand        q3, q3, q9
+    vmovl.u8    q4, d20                     ; widen 0x03 to 16-bit for the multiply
+
+    vqsub.s8    q1, q5, q8                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; q14: vp8_hevmask
+
+    ;vmul.i8    q2, q2, q10                 ; 3 * ( qs0 - ps0)
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp8_filter &= hev
+    vand        q15, q15, q3                ; q15: vp8_filter_mask
+    ;;
+    ;vld1.u8    {q4}, [r12]!                ;no need 7 any more
+
+    ;vqadd.s8   q1, q1, q2
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vld1.u8     {q9}, [r12]!                ; 0x04
+    ;
+    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d3, q11
+    ;;
+
+    vand        q1, q1, q15                 ; vp8_filter &= mask
+    ;;
+;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+;   vand        q2, q1, q4                  ; s = vp8_filter & 7
+;   vqadd.s8    q1, q1, q9                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+    ;;;;
+;   vshr.s8     q1, q1, #3                  ; vp8_filter >>= 3
+;   vceq.i8     q2, q2, q9                  ; s = (s==4)*-1
+    ;;
+;   ;calculate output
+;   vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+;   vqadd.s8    q11, q2, q1                 ; u = vp8_signed_char_clamp(s + vp8_filter)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; q10=3
+    vqadd.s8    q2, q1, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vrshr.s8    q1, q1, #1                  ;round/shift: vp8_filter += 1; vp8_filter >>= 1
+
+    sub         r0, r0, r1, lsl #4          ; rewind src pointer by the 16 rows read
+    add         r0, r0, #2                  ; column of op1
+    ;
+
+    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
+    add         r2, r0, r1                  ; r2 = second row
+    ;
+
+    vqadd.s8    q13, q5, q1                 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+    ;vqadd.s8   q11, q6, q11                ; u = vp8_signed_char_clamp(ps0 + u)
+    vqsub.s8    q12, q8, q1                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+    add         r3, r2, r1
+    ;
+    vswp        d12, d11                    ; interleave op1/op0/oq0/oq1 for 4-byte stores
+    vswp        d16, d13
+    add         r12, r3, r1
+    vswp        d14, d12
+    vswp        d16, d15
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0]
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r2]
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r3]
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+    add         r0, r12, r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r12]
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    add         r2, r0, r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0]
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r2], r1
+    add         r3, r2, r1
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2]
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r3], r1
+    add         r12, r3, r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r3]
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    add         r0, r12, r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r12]
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
+    add         r2, r0, r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
+
+;-----------------
+ AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_vlfy_coeff_
+ DCD vlfy_coeff
+vlfy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
+
+ END
diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm
new file mode 100644
index 000000000..86eddaa2e
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm
@@ -0,0 +1,257 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *u,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; stack(r5) unsigned char *v
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC ; filter a horizontal MB edge on U and V together (u rows in even d-regs, v rows in odd)
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ ldr r3, [sp, #4] ; load v ptr
+ ldr r12, [sp, #0] ; load thresh pointer
+ sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0], r1 ; p3
+ vld1.u8 {d7}, [r3], r1 ; p3
+ vld1.u8 {d8}, [r0], r1 ; p2
+ vld1.u8 {d9}, [r3], r1 ; p2
+ vld1.u8 {d10}, [r0], r1 ; p1
+ vld1.u8 {d11}, [r3], r1 ; p1
+ vld1.u8 {d12}, [r0], r1 ; p0
+ vld1.u8 {d13}, [r3], r1 ; p0
+ vld1.u8 {d14}, [r0], r1 ; q0
+ vld1.u8 {d15}, [r3], r1 ; q0
+ vld1.u8 {d16}, [r0], r1 ; q1
+ vld1.u8 {d17}, [r3], r1 ; q1
+ vld1.u8 {d18}, [r0], r1 ; q2
+ vld1.u8 {d19}, [r3], r1 ; q2
+ vld1.u8 {d20}, [r0], r1 ; q3
+ vld1.u8 {d21}, [r3], r1 ; q3
+
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+
+ ldr r12, _mbhlfuv_coeff_ ; r12 walks the constant table below
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
+
+ vand q15, q15, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vld1.s8 {d4[], d5[]}, [r2] ; flimit
+
+ vand q10, q10, q11
+ vand q3, q3, q0
+
+ vld1.u8 {q0}, [r12]! ; 0x80 sign-bias constant
+
+ vadd.u8 q2, q2, q2 ; flimit * 2
+ vadd.u8 q2, q2, q1 ; flimit * 2 + limit
+
+ vabd.u8 q1, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
+ vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+ veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
+ veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
+;;;;;;;;;;;;;
+ vorr q14, q13, q14 ; q14: vp8_hevmask
+
+ ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) widened to s16 to avoid clamping
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q11, q13, q13
+
+ vand q3, q3, q12
+
+ ;vadd.s8 q2, q2, q10
+ vadd.s16 q2, q2, q10
+ vadd.s16 q13, q13, q11
+
+ vld1.u8 {q12}, [r12]! ;#3
+
+ ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vand q15, q15, q3 ; q15: vp8_filter_mask
+ vld1.u8 {q11}, [r12]! ;#4
+
+ vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d3, q13
+
+;;;;;;;;;;;;;;
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vld1.u8 {q15}, [r12]! ;#63
+ ;
+ vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
+
+ vld1.u8 {d7}, [r12]! ;#9
+ ;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+; vand q2, q13, q12 ; s = Filter2 & 7
+
+; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+; vld1.u8 {d6}, [r12]! ;#18
+
+; sub r0, r0, r1, lsl #3
+; sub r3, r3, r1, lsl #3
+
+; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
+
+; add r0, r0, r1
+; add r3, r3, r1
+
+; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
+
+; vld1.u8 {d5}, [r12]! ;#27
+; vmov q10, q15
+; vmov q12, q15
+
+; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
+
+ vld1.u8 {d6}, [r12]! ;#18
+
+ sub r0, r0, r1, lsl #3 ; rewind u pointer to the first loaded row
+ sub r3, r3, r1, lsl #3 ; rewind v pointer to the first loaded row
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q10, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+
+ vld1.u8 {d5}, [r12]! ;#27
+
+ add r0, r0, r1 ; step down to the p2 row (first row written)
+ add r3, r3, r1
+
+ vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+ vmov q11, q15
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q10, d2, d7 ; Filter2 * 9
+ vmlal.s8 q11, d3, d7
+ vmlal.s8 q12, d2, d6 ; Filter2 * 18
+ vmlal.s8 q13, d3, d6
+ vmlal.s8 q14, d2, d5 ; Filter2 * 27
+ vmlal.s8 q15, d3, d5
+ vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d21, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
+ vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
+ veor q9, q11, q0 ; *oq2 = s^0x80
+ veor q4, q10, q0 ; *op2 = s^0x80
+ veor q8, q13, q0 ; *oq1 = s^0x80
+ veor q5, q12, q0 ; *op1 = s^0x80
+ veor q7, q15, q0 ; *oq0 = s^0x80
+ veor q6, q14, q0 ; *op0 = s^0x80
+
+ vst1.u8 {d8}, [r0], r1 ; store u op2
+ vst1.u8 {d9}, [r3], r1 ; store v op2
+ vst1.u8 {d10}, [r0], r1 ; store u op1
+ vst1.u8 {d11}, [r3], r1 ; store v op1
+ vst1.u8 {d12}, [r0], r1 ; store u op0
+ vst1.u8 {d13}, [r3], r1 ; store v op0
+ vst1.u8 {d14}, [r0], r1 ; store u oq0
+ vst1.u8 {d15}, [r3], r1 ; store v oq0
+ vst1.u8 {d16}, [r0], r1 ; store u oq1
+ vst1.u8 {d17}, [r3], r1 ; store v oq1
+ vst1.u8 {d18}, [r0], r1 ; store u oq2
+ vst1.u8 {d19}, [r3], r1 ; store v oq2
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+;-----------------
+ AREA mbhloopfilteruv_dat, DATA, READWRITE ;read/write by default
+;Constant table read sequentially through r12 above: 0x80 sign-bias, 3, 4,
+;the 63 rounding bias (s16 lanes), then the 9/18/27 taps. One word per DCD.
+;Data address: mbhlfuv_coeff, mbhlfuv_coeff+4, mbhlfuv_coeff+8 ...
+_mbhlfuv_coeff_
+ DCD mbhlfuv_coeff
+mbhlfuv_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
+ DCD 0x1b1b1b1b, 0x1b1b1b1b
+
+ END
diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm
new file mode 100644
index 000000000..2ab0fc240
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm
@@ -0,0 +1,236 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC ; filter one 16-pixel-wide horizontal MB edge on the Y plane
+ sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r12, [sp, #0] ; load thresh pointer
+
+ vld1.u8 {q3}, [r0], r1 ; p3
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ vld1.u8 {q4}, [r0], r1 ; p2
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ vld1.u8 {q5}, [r0], r1 ; p1
+ ldr r12, _mbhlfy_coeff_ ; r12 walks the constant table below
+ vld1.u8 {q6}, [r0], r1 ; p0
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vld1.u8 {q7}, [r0], r1 ; q0
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vld1.u8 {q8}, [r0], r1 ; q1
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vld1.u8 {q9}, [r0], r1 ; q2
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vld1.u8 {q10}, [r0], r1 ; q3
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
+
+ vand q15, q15, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vld1.s8 {d4[], d5[]}, [r2] ; flimit
+
+ vand q10, q10, q11
+ vand q3, q3, q0
+
+ vld1.u8 {q0}, [r12]! ; 0x80 sign-bias constant
+
+ vadd.u8 q2, q2, q2 ; flimit * 2
+ vadd.u8 q2, q2, q1 ; flimit * 2 + limit
+
+ vabd.u8 q1, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
+ vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+ veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
+ veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
+;;;;;;;;;;;;;
+ vorr q14, q13, q14 ; q14: vp8_hevmask
+
+ ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) widened to s16 to avoid clamping
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q11, q13, q13
+
+ vand q3, q3, q12
+
+ ;vadd.s8 q2, q2, q10
+ vadd.s16 q2, q2, q10
+ vadd.s16 q13, q13, q11
+
+ vld1.u8 {q12}, [r12]! ;#3
+
+ ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vand q15, q15, q3 ; q15: vp8_filter_mask
+ vld1.u8 {q11}, [r12]! ;#4
+
+ vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d3, q13
+
+;;;;;;;;;;;;;;
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vld1.u8 {q15}, [r12]! ;#63
+ ;
+ vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
+
+ vld1.u8 {d7}, [r12]! ;#9
+ sub r0, r0, r1, lsl #3 ; rewind src pointer to the first loaded row
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+; vand q2, q13, q12 ; s = Filter2 & 7
+
+; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+; vld1.u8 {d6}, [r12]! ;#18
+
+; add r0, r0, r1
+; add r2, r0, r1
+
+; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
+
+; add r3, r2, r1
+
+; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
+
+; vld1.u8 {d5}, [r12]! ;#27
+; vmov q10, q15
+; vmov q12, q15
+
+; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
+
+ vld1.u8 {d6}, [r12]! ;#18
+ add r0, r0, r1 ; r0 -> p2 row (first row written)
+ add r2, r0, r1 ; r2 -> p1 row
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q10, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+
+ vld1.u8 {d5}, [r12]! ;#27
+ add r3, r2, r1 ; r3 -> p0 row
+
+ vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+ vmov q11, q15
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q10, d2, d7 ; Filter2 * 9
+ vmlal.s8 q11, d3, d7
+ vmlal.s8 q12, d2, d6 ; Filter2 * 18
+ vmlal.s8 q13, d3, d6
+ vmlal.s8 q14, d2, d5 ; Filter2 * 27
+ vmlal.s8 q15, d3, d5
+ vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d21, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
+ vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
+ veor q9, q11, q0 ; *oq2 = s^0x80
+ veor q4, q10, q0 ; *op2 = s^0x80
+ veor q5, q12, q0 ; *op1 = s^0x80
+ veor q6, q14, q0 ; *op0 = s^0x80
+ veor q8, q13, q0 ; *oq1 = s^0x80
+ veor q7, q15, q0 ; *oq0 = s^0x80
+
+ vst1.u8 {q4}, [r0] ; store op2
+ vst1.u8 {q5}, [r2] ; store op1
+ vst1.u8 {q6}, [r3], r1 ; store op0
+ add r12, r3, r1
+ vst1.u8 {q7}, [r3] ; store oq0
+ vst1.u8 {q8}, [r12], r1 ; store oq1
+ vst1.u8 {q9}, [r12] ; store oq2
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+;-----------------
+ AREA mbhloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Constant table read sequentially through r12 above: 0x80 sign-bias, 3, 4,
+;the 63 rounding bias (s16 lanes), then the 9/18/27 taps. One word per DCD.
+;Data address: mbhlfy_coeff, mbhlfy_coeff+4, mbhlfy_coeff+8 ...
+_mbhlfy_coeff_
+ DCD mbhlfy_coeff
+mbhlfy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
+ DCD 0x1b1b1b1b, 0x1b1b1b1b
+
+ END
diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm
new file mode 100644
index 000000000..ad5afba34
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm
@@ -0,0 +1,296 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *u,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; stack(r5) unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC ; filter a vertical MB edge on U and V together via transpose
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ ldr r3, [sp, #4] ; load v ptr
+ ldr r12, [sp, #0] ; load thresh pointer
+
+ sub r3, r3, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ;load u data
+ vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r3], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub sp, sp, #32 ; spill p3/q3 (q3, q10) so the registers can be reused
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ vst1.u8 {q3}, [sp]!
+ ldr r12, _mbvlfuv_coeff_ ; r12 walks the constant table below
+ vst1.u8 {q10}, [sp]!
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
+
+ vand q15, q15, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vld1.s8 {d4[], d5[]}, [r2] ; flimit
+
+ vand q10, q10, q11
+ vand q3, q3, q0
+
+ vld1.u8 {q0}, [r12]! ; 0x80 sign-bias constant
+
+ vadd.u8 q2, q2, q2 ; flimit * 2
+ vadd.u8 q2, q2, q1 ; flimit * 2 + limit
+
+ vabd.u8 q1, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
+ vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+ veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
+ veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
+;;;;;;;;;;;;;
+ vorr q14, q13, q14 ; q14: vp8_hevmask
+
+ ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) widened to s16 to avoid clamping
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q11, q13, q13
+
+ vand q3, q3, q12
+
+ ;vadd.s8 q2, q2, q10
+ vadd.s16 q2, q2, q10
+ vadd.s16 q13, q13, q11
+
+ vld1.u8 {q12}, [r12]! ;#3
+
+ ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vand q15, q15, q3 ; q15: vp8_filter_mask
+ vld1.u8 {q11}, [r12]! ;#4
+
+ vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d3, q13
+
+;;;;;;;;;;;;;;
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vld1.u8 {q15}, [r12]! ;#63
+ ;
+ vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
+
+ vld1.u8 {d7}, [r12]! ;#9
+ ;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+; vand q2, q13, q12 ; s = Filter2 & 7
+
+; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+; vld1.u8 {d6}, [r12]! ;#18
+
+; sub r0, r0, r1, lsl #3
+; sub r3, r3, r1, lsl #3
+; sub sp, sp, #32
+
+; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
+
+; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
+
+; vld1.u8 {d5}, [r12]! ;#27
+; vmov q10, q15
+; vmov q12, q15
+
+; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
+
+ vld1.u8 {d6}, [r12]! ;#18
+
+ sub r0, r0, r1, lsl #3 ; rewind u pointer to the first loaded row
+ sub r3, r3, r1, lsl #3 ; rewind v pointer to the first loaded row
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q10, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+
+ vld1.u8 {d5}, [r12]! ;#27
+
+ sub sp, sp, #32 ; rewind sp to the spilled p3/q3 rows
+
+ vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+ vmov q11, q15
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q10, d2, d7 ; Filter2 * 9
+ vmlal.s8 q11, d3, d7
+ vmlal.s8 q12, d2, d6 ; Filter2 * 18
+ vmlal.s8 q13, d3, d6
+ vmlal.s8 q14, d2, d5 ; Filter2 * 27
+ vmlal.s8 q15, d3, d5
+ vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d21, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
+ vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
+ veor q9, q11, q0 ; *oq2 = s^0x80
+ veor q4, q10, q0 ; *op2 = s^0x80
+ veor q8, q13, q0 ; *oq1 = s^0x80
+ veor q5, q12, q0 ; *op1 = s^0x80
+ veor q7, q15, q0 ; *oq0 = s^0x80
+ vld1.u8 {q3}, [sp]! ; reload spilled p3
+ veor q6, q14, q0 ; *op0 = s^0x80
+ vld1.u8 {q10}, [sp]! ; reload spilled q3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r3], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r3], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r3], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r3], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r3], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r3], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r3], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d21}, [r3], r1
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+;-----------------
+ AREA mbvloopfilteruv_dat, DATA, READWRITE ;read/write by default
+;Constant table read sequentially through r12 above: 0x80 sign-bias, 3, 4,
+;the 63 rounding bias (s16 lanes), then the 9/18/27 taps. One word per DCD.
+;Data address: mbvlfuv_coeff, mbvlfuv_coeff+4, mbvlfuv_coeff+8 ...
+_mbvlfuv_coeff_
+ DCD mbvlfuv_coeff
+mbvlfuv_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
+ DCD 0x1b1b1b1b, 0x1b1b1b1b
+
+ END
diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm
new file mode 100644
index 000000000..60e517519
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm
@@ -0,0 +1,303 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC ; filter one 16-row vertical MB edge on the Y plane via transpose
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
+ ldr r12, [sp, #0] ; load thresh pointer
+ vld1.u8 {d8}, [r0], r1
+ sub sp, sp, #32 ; reserve spill space for p3/q3 (q3, q10)
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d20}, [r0], r1
+
+ vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
+ vld1.u8 {d9}, [r0], r1
+ vld1.u8 {d11}, [r0], r1
+ vld1.u8 {d13}, [r0], r1
+ vld1.u8 {d15}, [r0], r1
+ vld1.u8 {d17}, [r0], r1
+ vld1.u8 {d19}, [r0], r1
+ vld1.u8 {d21}, [r0], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ vst1.u8 {q3}, [sp]! ; spill p3 so q3 can be reused
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ ldr r12, _mbvlfy_coeff_ ; r12 walks the constant table below
+ vst1.u8 {q10}, [sp]! ; spill q3 so q10 can be reused
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
+
+ vand q15, q15, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vld1.s8 {d4[], d5[]}, [r2] ; flimit
+
+ vand q10, q10, q11
+ vand q3, q3, q0
+
+ vld1.u8 {q0}, [r12]! ; 0x80 sign-bias constant
+
+ vadd.u8 q2, q2, q2 ; flimit * 2
+ vadd.u8 q2, q2, q1 ; flimit * 2 + limit
+
+ vabd.u8 q1, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
+ vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+ veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
+ veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
+;;;;;;;;;;;;;
+ vorr q14, q13, q14 ; q14: vp8_hevmask
+
+ ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) widened to s16 to avoid clamping
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
+ vadd.s16 q11, q13, q13
+
+ vand q3, q3, q12
+
+ ;vadd.s8 q2, q2, q10
+ vadd.s16 q2, q2, q10
+ vadd.s16 q13, q13, q11
+
+ vld1.u8 {q12}, [r12]! ;#3
+
+ ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vand q15, q15, q3 ; q15: vp8_filter_mask
+ vld1.u8 {q11}, [r12]! ;#4
+
+ vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d3, q13
+
+;;;;;;;;;;;;;;
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vld1.u8 {q15}, [r12]! ;#63
+ ;
+ vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
+
+ vld1.u8 {d7}, [r12]! ;#9
+ ;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+; vand q2, q13, q12 ; s = Filter2 & 7
+
+; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+; vld1.u8 {d6}, [r12]! ;#18
+
+; sub r0, r0, r1, lsl #4
+; sub sp, sp, #32
+; add r2, r0, r1
+
+; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
+
+; add r3, r2, r1
+
+; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
+
+; vld1.u8 {d5}, [r12]! ;#27
+; vmov q10, q15
+; vmov q12, q15
+
+; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
+
+ vld1.u8 {d6}, [r12]! ;#18
+ sub r0, r0, r1, lsl #4 ; rewind src pointer to the first loaded row
+ sub sp, sp, #32 ; rewind sp to the spilled p3/q3 rows
+
+ add r2, r0, r1 ; r2 -> second output row
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q10, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+
+ vld1.u8 {d5}, [r12]! ;#27
+ add r3, r2, r1 ; r3 -> third output row
+
+ vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+ vmov q11, q15
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q10, d2, d7 ; Filter2 * 9
+ vmlal.s8 q11, d3, d7
+ vmlal.s8 q12, d2, d6 ; Filter2 * 18
+ vmlal.s8 q13, d3, d6
+ vmlal.s8 q14, d2, d5 ; Filter2 * 27
+ vmlal.s8 q15, d3, d5
+ vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d21, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
+ vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
+ veor q9, q11, q0 ; *oq2 = s^0x80
+ veor q4, q10, q0 ; *op2 = s^0x80
+ veor q8, q13, q0 ; *oq1 = s^0x80
+ veor q5, q12, q0 ; *op1 = s^0x80
+ veor q7, q15, q0 ; *oq0 = s^0x80
+ vld1.u8 {q3}, [sp]! ; reload spilled p3
+ veor q6, q14, q0 ; *op0 = s^0x80
+ vld1.u8 {q10}, [sp]! ; reload spilled q3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+ add r12, r3, r1 ; r12 -> fourth output row
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0]
+ vst1.8 {d8}, [r2]
+ vst1.8 {d10}, [r3]
+ vst1.8 {d12}, [r12], r1
+ add r0, r12, r1
+ vst1.8 {d14}, [r12]
+ vst1.8 {d16}, [r0], r1
+ add r2, r0, r1
+ vst1.8 {d18}, [r0]
+ vst1.8 {d20}, [r2], r1
+ add r3, r2, r1
+ vst1.8 {d7}, [r2]
+ vst1.8 {d9}, [r3], r1
+ add r12, r3, r1
+ vst1.8 {d11}, [r3]
+ vst1.8 {d13}, [r12], r1
+ add r0, r12, r1
+ vst1.8 {d15}, [r12]
+ vst1.8 {d17}, [r0], r1
+ add r2, r0, r1
+ vst1.8 {d19}, [r0]
+ vst1.8 {d21}, [r2]
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+;-----------------
+ AREA mbvloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Constant table read sequentially through r12 above: 0x80 sign-bias, 3, 4,
+;the 63 rounding bias (s16 lanes), then the 9/18/27 taps. One word per DCD.
+;Data address: mbvlfy_coeff, mbvlfy_coeff+4, mbvlfy_coeff+8 ...
+_mbvlfy_coeff_
+ DCD mbvlfy_coeff
+mbvlfy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
+ DCD 0x1b1b1b1b, 0x1b1b1b1b
+
+ END
diff --git a/vp8/common/arm/neon/recon16x16mb_neon.asm b/vp8/common/arm/neon/recon16x16mb_neon.asm
new file mode 100644
index 000000000..b9ba1cbc3
--- /dev/null
+++ b/vp8/common/arm/neon/recon16x16mb_neon.asm
@@ -0,0 +1,130 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon16x16mb_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int ystride,
+; stack unsigned char *udst_ptr,
+; stack unsigned char *vdst_ptr
+
+|vp8_recon16x16mb_neon| PROC
+    mov             r12, #4                ;loop counter for Y loop (4 iters x 4 rows = 16 rows)
+
+recon16x16mb_loop_y
+    vld1.u8         {q12, q13}, [r0]!      ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!        ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]!
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24                ;modify Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]!
+
+    pld             [r0]                   ;prefetch upcoming pred/diff data
+    pld             [r1]
+    pld             [r1, #64]
+
+    vadd.s16        q0, q0, q8             ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+    vadd.s16        q7, q7, q15
+
+    vqmovun.s16     d0, q0                 ;CLAMP() saturation (narrow s16 to u8)
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vqmovun.s16     d4, q4
+    vqmovun.s16     d5, q5
+    vst1.u8         {q0}, [r2], r3         ;store result (16 bytes per Y row)
+    vqmovun.s16     d6, q6
+    vst1.u8         {q1}, [r2], r3
+    vqmovun.s16     d7, q7
+    vst1.u8         {q2}, [r2], r3
+    subs            r12, r12, #1
+
+    moveq           r12, #2                ;loop counter for UV loop (U pass, then V pass)
+
+    vst1.u8         {q3}, [r2], r3
+    bne             recon16x16mb_loop_y
+
+    mov             r3, r3, lsr #1         ;uv_stride = ystride>>1
+    ldr             r2, [sp]               ;load upred_ptr
+
+recon16x16mb_loop_uv
+    vld1.u8         {q12, q13}, [r0]!      ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!        ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]!
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24                ;modify Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]!
+
+    vadd.s16        q0, q0, q8             ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+
+    vqmovun.s16     d0, q0                 ;CLAMP() saturation (narrow s16 to u8)
+    vadd.s16        q7, q7, q15
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vst1.u8         {d0}, [r2], r3         ;store result (8 bytes per chroma row)
+    vqmovun.s16     d4, q4
+    vst1.u8         {d1}, [r2], r3
+    vqmovun.s16     d5, q5
+    vst1.u8         {d2}, [r2], r3
+    vqmovun.s16     d6, q6
+    vst1.u8         {d3}, [r2], r3
+    vqmovun.s16     d7, q7
+    vst1.u8         {d4}, [r2], r3
+    subs            r12, r12, #1
+
+    vst1.u8         {d5}, [r2], r3
+    vst1.u8         {d6}, [r2], r3
+    vst1.u8         {d7}, [r2], r3
+
+    ldrne           r2, [sp, #4]           ;load vpred_ptr for the second UV pass
+    bne             recon16x16mb_loop_uv
+
+    bx              lr
+
+    ENDP
+ END
diff --git a/vp8/common/arm/neon/recon2b_neon.asm b/vp8/common/arm/neon/recon2b_neon.asm
new file mode 100644
index 000000000..25aaf8c8e
--- /dev/null
+++ b/vp8/common/arm/neon/recon2b_neon.asm
@@ -0,0 +1,53 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon2b_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int stride
+
+|vp8_recon2b_neon| PROC
+    vld1.u8         {q8, q9}, [r0]         ;load data from pred_ptr
+    vld1.16         {q4, q5}, [r1]!        ;load data from diff_ptr
+
+    vmovl.u8        q0, d16                ;modify Pred data from 8 bits to 16 bits
+    vld1.16         {q6, q7}, [r1]!
+    vmovl.u8        q1, d17
+    vmovl.u8        q2, d18
+    vmovl.u8        q3, d19
+
+    vadd.s16        q0, q0, q4             ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q5
+    vadd.s16        q2, q2, q6
+    vadd.s16        q3, q3, q7
+
+    vqmovun.s16     d0, q0                 ;CLAMP() saturation (narrow s16 to u8)
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    add             r0, r2, r3             ;r0 -> dst row 1
+
+    vst1.u8         {d0}, [r2]             ;store result (4 rows of 8 pixels)
+    vst1.u8         {d1}, [r0], r3
+    add             r2, r0, r3             ;r2 -> dst row 3
+    vst1.u8         {d2}, [r0]
+    vst1.u8         {d3}, [r2], r3
+
+    bx              lr
+
+    ENDP
+ END
diff --git a/vp8/common/arm/neon/recon4b_neon.asm b/vp8/common/arm/neon/recon4b_neon.asm
new file mode 100644
index 000000000..a4f5b806b
--- /dev/null
+++ b/vp8/common/arm/neon/recon4b_neon.asm
@@ -0,0 +1,68 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon4b_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int stride
+
+|vp8_recon4b_neon| PROC
+    vld1.u8         {q12, q13}, [r0]!      ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!        ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24                ;modify Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]
+
+    vadd.s16        q0, q0, q8             ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+    vadd.s16        q7, q7, q15
+
+    vqmovun.s16     d0, q0                 ;CLAMP() saturation (narrow s16 to u8)
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vqmovun.s16     d4, q4
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    add             r0, r2, r3             ;r0 -> dst row 1
+
+    vst1.u8         {q0}, [r2]             ;store result (4 rows of 16 pixels)
+    vst1.u8         {q1}, [r0], r3
+    add             r2, r0, r3             ;r2 -> dst row 3
+    vst1.u8         {q2}, [r0]
+    vst1.u8         {q3}, [r2], r3
+
+    bx              lr
+
+    ENDP
+ END
diff --git a/vp8/common/arm/neon/reconb_neon.asm b/vp8/common/arm/neon/reconb_neon.asm
new file mode 100644
index 000000000..16d85a0d5
--- /dev/null
+++ b/vp8/common/arm/neon/reconb_neon.asm
@@ -0,0 +1,60 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon_b_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int stride
+
+|vp8_recon_b_neon| PROC
+    mov             r12, #16               ;pred rows are 16 bytes apart
+
+    vld1.u8         {d28}, [r0], r12       ;load 4 data/line from pred_ptr
+    vld1.16         {q10, q11}, [r1]!      ;load data from diff_ptr; overlapping loads step
+    vld1.u8         {d29}, [r0], r12       ;one 16-short diff row each, so the low half of
+    vld1.16         {q11, q12}, [r1]!      ;q10/q11/q12/q13 holds the 4 shorts of rows 0-3
+    vld1.u8         {d30}, [r0], r12
+    vld1.16         {q12, q13}, [r1]!
+    vld1.u8         {d31}, [r0], r12
+    vld1.16         {q13}, [r1]
+
+    vmovl.u8        q0, d28                ;modify Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d29                ;Pred data in d0, d2, d4, d6
+    vmovl.u8        q2, d30
+    vmovl.u8        q3, d31
+
+    vadd.s16        d0, d0, d20            ;add Diff data and Pred data together
+    vadd.s16        d2, d2, d22            ;(only the low 4 shorts of each diff row used)
+    vadd.s16        d4, d4, d24
+    vadd.s16        d6, d6, d26
+
+    vqmovun.s16     d0, q0                 ;CLAMP() saturation (narrow s16 to u8)
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    add             r1, r2, r3             ;r1 -> dst row 1
+
+    vst1.32         {d0[0]}, [r2]          ;store result (4 bytes per row)
+    vst1.32         {d1[0]}, [r1], r3
+    add             r2, r1, r3             ;r2 -> dst row 3
+    vst1.32         {d2[0]}, [r1]
+    vst1.32         {d3[0]}, [r2], r3
+
+    bx              lr
+
+    ENDP
+ END
diff --git a/vp8/common/arm/neon/save_neon_reg.asm b/vp8/common/arm/neon/save_neon_reg.asm
new file mode 100644
index 000000000..4873e447f
--- /dev/null
+++ b/vp8/common/arm/neon/save_neon_reg.asm
@@ -0,0 +1,35 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_push_neon|
+ EXPORT |vp8_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp8_push_neon| PROC
+    vst1.i64        {d8, d9, d10, d11}, [r0]!   ;save callee-saved NEON regs d8-d15
+    vst1.i64        {d12, d13, d14, d15}, [r0]! ;into the buffer at r0
+    bx              lr
+
+    ENDP
+
+|vp8_pop_neon| PROC
+    vld1.i64        {d8, d9, d10, d11}, [r0]!   ;restore callee-saved NEON regs d8-d15
+    vld1.i64        {d12, d13, d14, d15}, [r0]! ;from the buffer at r0
+    bx              lr
+
+    ENDP
+
+ END
+
diff --git a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
new file mode 100644
index 000000000..7d06ff908
--- /dev/null
+++ b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
@@ -0,0 +1,66 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_1_neon|
+ EXPORT |vp8_dc_only_idct_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+; r0 short *input;
+; r1 short *output;
+; r2 int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_short_idct4x4llm_1_neon| PROC
+    vld1.16         {d0[]}, [r0]           ;load input[0] and replicate it to all lanes
+
+    add             r3, r1, r2             ;output row pointers, stride = pitch
+    add             r12, r3, r2
+
+    vrshr.s16       d0, d0, #3             ;rounding shift: (input[0] + 4) >> 3
+
+    add             r0, r12, r2
+
+    vst1.16         {d0}, [r1]             ;store the DC value to all 4 output rows
+    vst1.16         {d0}, [r3]
+    vst1.16         {d0}, [r12]
+    vst1.16         {d0}, [r0]
+
+    bx              lr
+    ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+; r0 short input_dc;
+; r1 short *output;
+; r2 int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_dc_only_idct_neon| PROC
+    vdup.16         d0, r0                 ;replicate input_dc to all 4 lanes
+
+    add             r3, r1, r2             ;output row pointers, stride = pitch
+    add             r12, r3, r2
+
+    vrshr.s16       d0, d0, #3             ;rounding shift: (input_dc + 4) >> 3
+
+    add             r0, r12, r2
+
+    vst1.16         {d0}, [r1]             ;store the DC value to all 4 output rows
+    vst1.16         {d0}, [r3]
+    vst1.16         {d0}, [r12]
+    vst1.16         {d0}, [r0]
+
+    bx              lr
+
+    ENDP
+ END
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
new file mode 100644
index 000000000..ffecfbfbc
--- /dev/null
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -0,0 +1,126 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;r0 short * input
+;r1 short * output
+;r2 int pitch
+;*************************************************************
+;static const int cospi8sqrt2minus1=20091;
+;static const int sinpi8sqrt2 =35468;
+;static const int rounding = 0;
+;Optimization note: The data resulting from dequantization are signed 13-bit values in
+;the range [-4096, 4095]. This allows use of the "vqdmulh" (NEON) instruction, since
+;the product cannot go out of range (13+16+1=30 bits < 32 bits). This instruction gives
+;the high half of the multiplication result, which is what the IDCT needs.
+
+|vp8_short_idct4x4llm_neon| PROC
+    ldr             r12, _idct_coeff_      ;address of the coefficient table
+    vld1.16         {q1, q2}, [r0]         ;load all 16 input coefficients
+    vld1.16         {d0}, [r12]            ;d0 = {20091, 20091, 35468, 35468} (see data below)
+
+    vswp            d3, d4                 ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]          ;row pass: multiply by sinpi8sqrt2
+    vqdmulh.s16     q4, q2, d0[0]          ;and by cospi8sqrt2minus1 (high halves)
+
+    vqadd.s16       d12, d2, d3            ;a1
+    vqsub.s16       d13, d2, d3            ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2             ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q4, q4, q2
+
+    ;d6 - c1:temp1
+    ;d7 - d1:temp2
+    ;d8 - d1:temp1
+    ;d9 - c1:temp2
+
+    vqsub.s16       d10, d6, d9            ;c1
+    vqadd.s16       d11, d7, d8            ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vtrn.32         d2, d4                 ;transpose 4x4 for the column pass
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vswp            d3, d4
+
+    vqdmulh.s16     q3, q2, d0[2]          ;column pass: same butterfly as above
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3            ;a1
+    vqsub.s16       d13, d2, d3            ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2             ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9            ;c1
+    vqadd.s16       d11, d7, d8            ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vrshr.s16       d2, d2, #3             ;rounding shift: (x + 4) >> 3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    add             r3, r1, r2             ;output row pointers, stride = pitch
+    add             r12, r3, r2
+    add             r0, r12, r2
+
+    vtrn.32         d2, d4                 ;transpose back to row order
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vst1.16         {d2}, [r1]             ;store the 4 output rows
+    vst1.16         {d3}, [r3]
+    vst1.16         {d4}, [r12]
+    vst1.16         {d5}, [r0]
+
+    bx              lr
+
+    ENDP
+
+;-----------------
+ AREA idct4x4_dat, DATA, READWRITE ;read/write by default
+;Data section with name idct4x4_dat is specified. DCD reserves one word of memory
+;for each value listed below (2 words total). The label idct_coeff can be used to
+;access the data at addresses idct_coeff and idct_coeff+4.
+_idct_coeff_
+ DCD idct_coeff
+idct_coeff
+ DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+ END
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
new file mode 100644
index 000000000..9f5f0d2ce
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -0,0 +1,494 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+;Note: To take advantage of the 8-bit multiplication instruction in NEON, first apply
+; abs() to the filter coeffs to make them u8. Then, use vmlsl for negative coeffs. After
+; multiplication, the result can be negative, so treat the result as s16. Since it is
+; also possible that the result is a large positive number (> 2^15-1), which could be
+; confused with a negative number, apply the filter coeffs in the order 0, 1, 4, 5, 2,
+; which ensures that the intermediate result stays in s16 range. Finally, saturating-add
+; the result of applying the 3rd filter coeff. The same applies to other filter functions.
+
+|vp8_sixtap_predict16x16_neon| PROC
+    push            {r4-r5, lr}
+
+    ldr             r12, _filter16_coeff_  ;address of the subpel filter table
+    ldr             r4, [sp, #12]          ;load parameters from stack (dst_ptr)
+    ldr             r5, [sp, #16]          ;load parameters from stack (dst_pitch)
+
+    cmp             r2, #0                 ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter16x16_only
+
+    add             r2, r12, r2, lsl #5    ;calculate filter location (32 bytes per filter)
+
+    cmp             r3, #0                 ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]       ;load first_pass filter
+
+    beq             firstpass_filter16x16_only
+
+    sub             sp, sp, #336           ;reserve space on stack for temporary storage
+    mov             lr, sp                 ;(21 rows x 16 bytes of first-pass output)
+
+    vabs.s32        q12, q14               ;get abs(filter_parameters)
+    vabs.s32        q13, q15
+
+    mov             r2, #7                 ;loop counter (7 iters x 3 rows = 21 rows)
+    sub             r0, r0, #2             ;move srcptr back to (line-2) and (column-2)
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]             ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+    vld1.u8         {d6, d7, d8}, [r0], r1 ;load src data (3 rows per iteration)
+    vld1.u8         {d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14}, [r0], r1
+
+    pld             [r0]                   ;prefetch the next 3 src rows
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q8, d6, d0             ;(src_ptr[-2] * vp8_filter[0])
+    vmull.u8        q9, d7, d0
+    vmull.u8        q10, d9, d0
+    vmull.u8        q11, d10, d0
+    vmull.u8        q12, d12, d0
+    vmull.u8        q13, d13, d0
+
+    vext.8          d28, d6, d7, #1        ;construct src_ptr[-1]
+    vext.8          d29, d9, d10, #1
+    vext.8          d30, d12, d13, #1
+
+    vmlsl.u8        q8, d28, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q12, d30, d1
+
+    vext.8          d28, d7, d8, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d13, d14, #1
+
+    vmlsl.u8        q9, d28, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8        q11, d29, d1
+    vmlsl.u8        q13, d30, d1
+
+    vext.8          d28, d6, d7, #4        ;construct src_ptr[2]
+    vext.8          d29, d9, d10, #4
+    vext.8          d30, d12, d13, #4
+
+    vmlsl.u8        q8, d28, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q12, d30, d4
+
+    vext.8          d28, d7, d8, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d13, d14, #4
+
+    vmlsl.u8        q9, d28, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8        q11, d29, d4
+    vmlsl.u8        q13, d30, d4
+
+    vext.8          d28, d6, d7, #5        ;construct src_ptr[3]
+    vext.8          d29, d9, d10, #5
+    vext.8          d30, d12, d13, #5
+
+    vmlal.u8        q8, d28, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q12, d30, d5
+
+    vext.8          d28, d7, d8, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d13, d14, #5
+
+    vmlal.u8        q9, d28, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8        q11, d29, d5
+    vmlal.u8        q13, d30, d5
+
+    vext.8          d28, d6, d7, #2        ;construct src_ptr[0]
+    vext.8          d29, d9, d10, #2
+    vext.8          d30, d12, d13, #2
+
+    vmlal.u8        q8, d28, d2            ;(src_ptr[0] * vp8_filter[2])
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q12, d30, d2
+
+    vext.8          d28, d7, d8, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d13, d14, #2
+
+    vmlal.u8        q9, d28, d2            ;(src_ptr[0] * vp8_filter[2])
+    vmlal.u8        q11, d29, d2
+    vmlal.u8        q13, d30, d2
+
+    vext.8          d28, d6, d7, #3        ;construct src_ptr[1]
+    vext.8          d29, d9, d10, #3
+    vext.8          d30, d12, d13, #3
+
+    vext.8          d15, d7, d8, #3
+    vext.8          d31, d10, d11, #3
+    vext.8          d6, d13, d14, #3
+
+    vmull.u8        q4, d28, d3            ;(src_ptr[1] * vp8_filter[3])
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+
+    vqadd.s16       q8, q4                 ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q10, q5                ;saturating add of the center tap last
+    vqadd.s16       q12, q6
+
+    vmull.u8        q6, d15, d3            ;(src_ptr[1] * vp8_filter[3])
+    vmull.u8        q7, d31, d3
+    vmull.u8        q3, d6, d3
+
+    subs            r2, r2, #1
+
+    vqadd.s16       q9, q6
+    vqadd.s16       q11, q7
+    vqadd.s16       q13, q3
+
+    vqrshrun.s16    d6, q8, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q9, #7
+    vqrshrun.s16    d8, q10, #7
+    vqrshrun.s16    d9, q11, #7
+    vqrshrun.s16    d10, q12, #7
+    vqrshrun.s16    d11, q13, #7
+
+    vst1.u8         {d6, d7, d8}, [lr]!    ;store result in the temp buffer
+    vst1.u8         {d9, d10, d11}, [lr]!
+
+    bne             filt_blk2d_fp16x16_loop_neon
+
+;Second pass: 16x16
+;secondpass_filter - do first 8-columns and then second 8-columns
+    add             r3, r12, r3, lsl #5    ;calculate second-pass filter location
+    sub             lr, lr, #336           ;rewind to the start of the temp buffer
+
+    vld1.s32        {q5, q6}, [r3]         ;load second_pass filter
+    mov             r3, #2                 ;loop counter (two 8-column halves)
+
+    vabs.s32        q7, q5                 ;get abs(filter_parameters)
+    vabs.s32        q8, q6
+
+    mov             r2, #16                ;temp-buffer row stride in bytes
+
+    vdup.8          d0, d14[0]             ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_sp16x16_outloop_neon
+    vld1.u8         {d18}, [lr], r2        ;load src data (first 5 rows)
+    vld1.u8         {d19}, [lr], r2
+    vld1.u8         {d20}, [lr], r2
+    vld1.u8         {d21}, [lr], r2
+    mov             r12, #4                ;loop counter (4 iters x 4 = 16 output rows)
+    vld1.u8         {d22}, [lr], r2
+
+secondpass_inner_loop_neon
+    vld1.u8         {d23}, [lr], r2        ;load src data (next 4 rows)
+    vld1.u8         {d24}, [lr], r2
+    vld1.u8         {d25}, [lr], r2
+    vld1.u8         {d26}, [lr], r2
+
+    vmull.u8        q3, d18, d0            ;(src_ptr[-2] * vp8_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2            ;(src_ptr[0] * vp8_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3            ;(src_ptr[1] * vp8_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r12, r12, #1
+
+    vqadd.s16       q7, q3                 ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4                 ;saturating add of the center tap last
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5         ;store result
+    vmov            q9, q11                ;slide the last 5 loaded rows down
+    vst1.u8         {d7}, [r4], r5         ;for the next inner iteration
+    vmov            q10, q12
+    vst1.u8         {d8}, [r4], r5
+    vmov            d22, d26
+    vst1.u8         {d9}, [r4], r5
+
+    bne             secondpass_inner_loop_neon
+
+    subs            r3, r3, #1
+    sub             lr, lr, #336           ;back to the start of the temp buffer,
+    add             lr, lr, #8             ;then over to the second 8 columns
+
+    sub             r4, r4, r5, lsl #4     ;rewind dst 16 rows,
+    add             r4, r4, #8             ;then over to the second 8 columns
+
+    bne             filt_blk2d_sp16x16_outloop_neon
+
+    add             sp, sp, #336           ;release the temporary storage
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_filter16x16_only
+    vabs.s32        q12, q14               ;get abs(filter_parameters)
+    vabs.s32        q13, q15
+
+    mov             r2, #8                 ;loop counter (8 iters x 2 rows = 16 rows)
+    sub             r0, r0, #2             ;move srcptr back to (column-2)
+
+    vdup.8          d0, d24[0]             ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d6, d7, d8}, [r0], r1 ;load src data (2 rows per iteration)
+    vld1.u8         {d9, d10, d11}, [r0], r1
+
+    pld             [r0]                   ;prefetch the next 2 src rows
+    pld             [r0, r1]
+
+    vmull.u8        q6, d6, d0             ;(src_ptr[-2] * vp8_filter[0])
+    vmull.u8        q7, d7, d0
+    vmull.u8        q8, d9, d0
+    vmull.u8        q9, d10, d0
+
+    vext.8          d20, d6, d7, #1        ;construct src_ptr[-1]
+    vext.8          d21, d9, d10, #1
+    vext.8          d22, d7, d8, #1
+    vext.8          d23, d10, d11, #1
+    vext.8          d24, d6, d7, #4        ;construct src_ptr[2]
+    vext.8          d25, d9, d10, #4
+    vext.8          d26, d7, d8, #4
+    vext.8          d27, d10, d11, #4
+    vext.8          d28, d6, d7, #5        ;construct src_ptr[3]
+    vext.8          d29, d9, d10, #5
+
+    vmlsl.u8        q6, d20, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8        q8, d21, d1
+    vmlsl.u8        q7, d22, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8        q9, d23, d1
+    vmlsl.u8        q6, d24, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8        q8, d25, d4
+    vmlsl.u8        q7, d26, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8        q9, d27, d4
+    vmlal.u8        q6, d28, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8        q8, d29, d5
+
+    vext.8          d20, d7, d8, #5
+    vext.8          d21, d10, d11, #5
+    vext.8          d22, d6, d7, #2        ;construct src_ptr[0]
+    vext.8          d23, d9, d10, #2
+    vext.8          d24, d7, d8, #2
+    vext.8          d25, d10, d11, #2
+
+    vext.8          d26, d6, d7, #3        ;construct src_ptr[1]
+    vext.8          d27, d9, d10, #3
+    vext.8          d28, d7, d8, #3
+    vext.8          d29, d10, d11, #3
+
+    vmlal.u8        q7, d20, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8        q9, d21, d5
+    vmlal.u8        q6, d22, d2            ;(src_ptr[0] * vp8_filter[2])
+    vmlal.u8        q8, d23, d2
+    vmlal.u8        q7, d24, d2            ;(src_ptr[0] * vp8_filter[2])
+    vmlal.u8        q9, d25, d2
+
+    vmull.u8        q10, d26, d3           ;(src_ptr[1] * vp8_filter[3])
+    vmull.u8        q11, d27, d3
+    vmull.u8        q12, d28, d3           ;(src_ptr[1] * vp8_filter[3])
+    vmull.u8        q15, d29, d3
+
+    vqadd.s16       q6, q10                ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q11                ;saturating add of the center tap last
+    vqadd.s16       q7, q12
+    vqadd.s16       q9, q15
+
+    subs            r2, r2, #1
+
+    vqrshrun.s16    d6, q6, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q7, #7
+    vqrshrun.s16    d8, q8, #7
+    vqrshrun.s16    d9, q9, #7
+
+    vst1.u8         {q3}, [r4], r5         ;store result (16 bytes per row)
+    vst1.u8         {q4}, [r4], r5
+
+    bne             filt_blk2d_fpo16x16_loop_neon
+
+    pop             {r4-r5,pc}
+
+;--------------------
+secondpass_filter16x16_only
+;Second pass: 16x16
+    add             r3, r12, r3, lsl #5    ;calculate filter location
+    sub             r0, r0, r1, lsl #1     ;move srcptr back to (line-2)
+
+    vld1.s32        {q5, q6}, [r3]         ;load second_pass filter
+    mov             r3, #2                 ;loop counter (two 8-column halves)
+
+    vabs.s32        q7, q5                 ;get abs(filter_parameters)
+    vabs.s32        q8, q6
+
+    vdup.8          d0, d14[0]             ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_spo16x16_outloop_neon
+    vld1.u8         {d18}, [r0], r1        ;load src data (first 5 rows)
+    vld1.u8         {d19}, [r0], r1
+    vld1.u8         {d20}, [r0], r1
+    vld1.u8         {d21}, [r0], r1
+    mov             r12, #4                ;loop counter (4 iters x 4 = 16 output rows)
+    vld1.u8         {d22}, [r0], r1
+
+secondpass_only_inner_loop_neon
+    vld1.u8         {d23}, [r0], r1        ;load src data (next 4 rows)
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+
+    vmull.u8        q3, d18, d0            ;(src_ptr[-2] * vp8_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2            ;(src_ptr[0] * vp8_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3            ;(src_ptr[1] * vp8_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r12, r12, #1
+
+    vqadd.s16       q7, q3                 ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4                 ;saturating add of the center tap last
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5         ;store result
+    vmov            q9, q11                ;slide the last 5 loaded rows down
+    vst1.u8         {d7}, [r4], r5         ;for the next inner iteration
+    vmov            q10, q12
+    vst1.u8         {d8}, [r4], r5
+    vmov            d22, d26
+    vst1.u8         {d9}, [r4], r5
+
+    bne             secondpass_only_inner_loop_neon
+
+    subs            r3, r3, #1
+    sub             r0, r0, r1, lsl #4     ;rewind src 21 (16+4+1) lines,
+    sub             r0, r0, r1, lsl #2
+    sub             r0, r0, r1
+    add             r0, r0, #8             ;then over to the second 8 columns
+
+    sub             r4, r4, r5, lsl #4     ;rewind dst 16 rows,
+    add             r4, r4, #8             ;then over to the second 8 columns
+
+    bne             filt_blk2d_spo16x16_outloop_neon
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+ AREA subpelfilters16_dat, DATA, READWRITE ;read/write by default
+;Data section with name subpelfilters16_dat is specified. DCD reserves one word of
+;memory for each value listed below (8 filters x 8 taps = 64 words). The label
+;filter16_coeff can be used to access the data: filter16_coeff, filter16_coeff+4 ...
+_filter16_coeff_
+ DCD filter16_coeff
+filter16_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+ END
diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
new file mode 100644
index 000000000..c23a9dbd1
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -0,0 +1,425 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_sixtap_predict_neon| PROC
+ push {r4, lr}
+
+ ldr r12, _filter4_coeff_
+ ldr r4, [sp, #8] ;r4 = dst_ptr (stack arg)
+ ldr lr, [sp, #12] ;lr = dst_pitch (stack arg)
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter4x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter4x4_only
+
+ vabs.s32 q12, q14 ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+ sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
+
+;First pass: output_height lines x output_width columns (9x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+
+ vld1.u8 {q3}, [r0], r1 ;load 4 of the remaining 5 src lines
+ vld1.u8 {q4}, [r0], r1
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+ ;First Pass on rest 5-line data
+ vld1.u8 {q11}, [r0], r1 ;load 9th (last) src line
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+ vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+ vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
+
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+ vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
+
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+ vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
+
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+ vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+ vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
+
+ add r3, r12, r3, lsl #5 ;calculate second_pass filter location
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+ vqadd.s16 q12, q11
+
+ vext.8 d23, d27, d28, #4
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d30, q8, #7
+ vqrshrun.s16 d31, q12, #7
+
+;Second pass: 4x4
+ vabs.s32 q7, q5 ;get abs(filter_parameters)
+ vabs.s32 q8, q6
+
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr ;compute dst row pointers (dst_ptr + n*dst_pitch)
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+firstpass_filter4x4_only
+ vabs.s32 q12, q14 ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+
+;First pass: output_height lines x output_width columns (4x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+
+ add r0, r4, lr ;compute dst row pointers (dst_ptr + n*dst_pitch)
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+ vst1.32 {d27[0]}, [r4] ;store result
+ vst1.32 {d27[1]}, [r0]
+ vst1.32 {d28[0]}, [r1]
+ vst1.32 {d28[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+secondpass_filter4x4_only
+ sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
+ add r3, r12, r3, lsl #5 ;calculate second_pass filter location
+
+ vld1.32 {d27[0]}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.32 {d27[1]}, [r0], r1
+ vabs.s32 q7, q5 ;get abs(filter_parameters)
+ vld1.32 {d28[0]}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.32 {d28[1]}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.32 {d29[0]}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.32 {d29[1]}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.32 {d30[0]}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.32 {d30[1]}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.32 {d31[0]}, [r0], r1
+ vdup.8 d5, d16[4]
+
+ vext.8 d23, d27, d28, #4
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr ;compute dst row pointers (dst_ptr + n*dst_pitch)
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+ AREA subpelfilters4_dat, DATA, READWRITE ;read/write by default
+;Data section subpelfilters4_dat. The DCD rows below reserve 64 words (8 filter
+;phases x 8 words each; only the first 6 taps per phase are used, rest is pad).
+;Data address: filter4_coeff, filter4_coeff+4, filter4_coeff+8 ...
+_filter4_coeff_
+ DCD filter4_coeff
+filter4_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+ END
diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
new file mode 100644
index 000000000..18e19f958
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -0,0 +1,476 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x4_neon| PROC
+ push {r4-r5, lr}
+
+ ldr r12, _filter8_coeff_
+ ldr r4, [sp, #12] ;r4 = dst_ptr (stack arg)
+ ldr r5, [sp, #16] ;r5 = dst_pitch (stack arg)
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x4_only
+
+ sub sp, sp, #32 ;reserve space on stack for temporary storage
+ vabs.s32 q12, q14 ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ mov lr, sp ;lr points at the temp storage block
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (9x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d22}, [lr]! ;store result to temp storage
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+ ;first_pass filtering on the rest 5-line data
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d27, q9, #7
+ vqrshrun.s16 d28, q10, #7
+ vqrshrun.s16 d29, q11, #7
+ vqrshrun.s16 d30, q12, #7
+
+;Second pass: 8x4
+;secondpass_filter
+ add r3, r12, r3, lsl #5 ;calculate second_pass filter location
+ sub lr, lr, #32 ;rewind lr to start of temp storage
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {q11}, [lr]! ;load intermediate data from stack
+
+ vabs.s32 q7, q5 ;get abs(filter_parameters)
+ vabs.s32 q8, q6
+
+ vld1.u8 {q12}, [lr]!
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ add sp, sp, #32 ;release temp storage
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter8x4_only
+ vabs.s32 q12, q14 ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (4x8)
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x4_only
+;Second pass: 8x4
+ add r3, r12, r3, lsl #5 ;calculate second_pass filter location
+ sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vabs.s32 q7, q5 ;get abs(filter_parameters)
+ vabs.s32 q8, q6
+
+ vld1.u8 {d22}, [r0], r1
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d28}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d29}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d30}, [r0], r1
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+ AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
+;Data section subpelfilters8_dat. The DCD rows below reserve 64 words (8 filter
+;phases x 8 words each; only the first 6 taps per phase are used, rest is pad).
+;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
+_filter8_coeff_
+ DCD filter8_coeff
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+ END
diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
new file mode 100644
index 000000000..d27485e6c
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -0,0 +1,527 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x8_neon| PROC
+ push {r4-r5, lr}
+
+ ldr r12, _filter8_coeff_
+
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x8_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x8_only
+
+ sub sp, sp, #64 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (13x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+filt_blk2d_fp8x8_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+ bne filt_blk2d_fp8x8_loop_neon
+
+ ;first_pass filtering on the rest 5-line data
+ ;vld1.u8 {q3}, [r0], r1 ;load src data
+ ;vld1.u8 {q4}, [r0], r1
+ ;vld1.u8 {q5}, [r0], r1
+ ;vld1.u8 {q6}, [r0], r1
+ vld1.u8 {q7}, [r0], r1
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ add r3, r12, r3, lsl #5
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ sub lr, lr, #64
+ vqrshrun.s16 d27, q9, #7
+ vld1.u8 {q9}, [lr]! ;load intermediate data from stack
+ vqrshrun.s16 d28, q10, #7
+ vld1.u8 {q10}, [lr]!
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q11, #7
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vqrshrun.s16 d30, q12, #7
+ vld1.u8 {q12}, [lr]!
+
+;Second pass: 8x8
+ mov r3, #2 ;loop counter
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_sp8x8_loop_neon
+
+ add sp, sp, #64
+ pop {r4-r5,pc}
+
+;---------------------
firstpass_filter8x8_only
    ;------------------------------------------------------------------
    ; Horizontal-only 8x8 six-tap path: taken when no vertical
    ; (second-pass) filtering is required.  The packed filter
    ; coefficients are already in q14/q15, loaded by the dispatch code
    ; that branches here (the commented-out load is kept for reference).
    ; Register use (set up by the caller before the branch):
    ;   r0 = src_ptr, r1 = src_pixels_per_line
    ;   r4 = dst_ptr, r5 = dst_pitch
    ;------------------------------------------------------------------
    ;add            r2, r12, r2, lsl #5     ;calculate filter location
    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
    vabs.s32        q12, q14                ;taps are stored signed: take magnitudes;
    vabs.s32        q13, q15                ;the negative taps are applied with vmlsl below

    mov             r2, #2                  ;loop counter (2 iterations x 4 rows = 8 rows)
    sub             r0, r0, #2              ;move srcptr back to column-2 (horizontal only,
                                            ;so no line adjustment is needed)

    vdup.8          d0, d14[0]              ;NOTE: d24..d26 alias q12/q13
    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vdup.8          d1, d24[4]
    vdup.8          d2, d25[0]
    vdup.8          d3, d25[4]
    vdup.8          d4, d26[0]
    vdup.8          d5, d26[4]

;First pass: output_height lines x output_width columns (8x8)
;Each iteration filters 4 rows.  16 source bytes are loaded per row so
;that src_ptr[-2..+5] for all 8 output columns is available via vext.
filt_blk2d_fpo8x8_loop_neon
    vld1.u8         {q3}, [r0], r1          ;load src data
    vld1.u8         {q4}, [r0], r1
    vld1.u8         {q5}, [r0], r1
    vld1.u8         {q6}, [r0], r1

    pld             [r0]                    ;prefetch the next iteration's rows
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q8, d8, d0
    vmull.u8        q9, d10, d0
    vmull.u8        q10, d12, d0

    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
    vext.8          d29, d8, d9, #1
    vext.8          d30, d10, d11, #1
    vext.8          d31, d12, d13, #1

    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d29, d1
    vmlsl.u8        q9, d30, d1
    vmlsl.u8        q10, d31, d1

    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
    vext.8          d29, d8, d9, #4
    vext.8          d30, d10, d11, #4
    vext.8          d31, d12, d13, #4

    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d29, d4
    vmlsl.u8        q9, d30, d4
    vmlsl.u8        q10, d31, d4

    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
    vext.8          d29, d8, d9, #2
    vext.8          d30, d10, d11, #2
    vext.8          d31, d12, d13, #2

    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d29, d2
    vmlal.u8        q9, d30, d2
    vmlal.u8        q10, d31, d2

    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d29, d8, d9, #5
    vext.8          d30, d10, d11, #5
    vext.8          d31, d12, d13, #5

    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q8, d29, d5
    vmlal.u8        q9, d30, d5
    vmlal.u8        q10, d31, d5

    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
    vext.8          d29, d8, d9, #3
    vext.8          d30, d10, d11, #3
    vext.8          d31, d12, d13, #3

    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q4, d29, d3
    vmull.u8        q5, d30, d3
    vmull.u8        q6, d31, d3

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    subs            r2, r2, #1

    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
    vqrshrun.s16    d23, q8, #7
    vqrshrun.s16    d24, q9, #7
    vqrshrun.s16    d25, q10, #7

    vst1.u8         {d22}, [r4], r5         ;store result
    vst1.u8         {d23}, [r4], r5
    vst1.u8         {d24}, [r4], r5
    vst1.u8         {d25}, [r4], r5

    bne             filt_blk2d_fpo8x8_loop_neon

    pop             {r4-r5,pc}
+
+;---------------------
secondpass_filter8x8_only
    ;------------------------------------------------------------------
    ; Vertical-only 8x8 six-tap path: taken when no horizontal
    ; (first-pass) filtering is required.  Thirteen consecutive source
    ; rows (rows -2 .. +10) are preloaded into d18-d30, interleaved with
    ; the filter set-up to hide load latency.
    ; Register use (set up by the caller before the branch):
    ;   r0 = src_ptr, r1 = src_pixels_per_line
    ;   r3 = yoffset (filter index), r12 = filter table base
    ;   r4 = dst_ptr, r5 = dst_pitch
    ;------------------------------------------------------------------
    sub             r0, r0, r1, lsl #1      ;move srcptr back two lines
    add             r3, r12, r3, lsl #5     ;each filter entry is 32 bytes

    vld1.u8         {d18}, [r0], r1         ;load 13 rows of src data, d18-d30
    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
    vld1.u8         {d19}, [r0], r1
    vabs.s32        q7, q5                  ;taps are stored signed: take magnitudes;
    vld1.u8         {d20}, [r0], r1
    vabs.s32        q8, q6                  ;negative taps are applied with vmlsl below
    vld1.u8         {d21}, [r0], r1
    mov             r3, #2                  ;loop counter (2 iterations x 4 rows)
    vld1.u8         {d22}, [r0], r1
    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vld1.u8         {d23}, [r0], r1
    vdup.8          d1, d14[4]
    vld1.u8         {d24}, [r0], r1
    vdup.8          d2, d15[0]
    vld1.u8         {d25}, [r0], r1
    vdup.8          d3, d15[4]
    vld1.u8         {d26}, [r0], r1
    vdup.8          d4, d16[0]
    vld1.u8         {d27}, [r0], r1
    vdup.8          d5, d16[4]
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

;Second pass: 8x8.  Each iteration filters rows d18-d21 against the
;9-row window d18-d26, then slides the window down 4 rows with vmov.
filt_blk2d_spo8x8_loop_neon
    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d19, d0
    vmull.u8        q5, d20, d0
    vmull.u8        q6, d21, d0

    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q4, d20, d1
    vmlsl.u8        q5, d21, d1
    vmlsl.u8        q6, d22, d1

    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d23, d4
    vmlsl.u8        q5, d24, d4
    vmlsl.u8        q6, d25, d4

    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d21, d2
    vmlal.u8        q5, d22, d2
    vmlal.u8        q6, d23, d2

    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q4, d24, d5
    vmlal.u8        q5, d25, d5
    vmlal.u8        q6, d26, d5

    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q8, d22, d3
    vmull.u8        q9, d23, d3
    vmull.u8        q10, d24, d3

    subs            r3, r3, #1

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q8, #7
    vqrshrun.s16    d8, q9, #7
    vqrshrun.s16    d9, q10, #7

    vmov            q9, q11                 ;slide the source-row window down by
    vst1.u8         {d6}, [r4], r5          ;store result
    vmov            q10, q12                ;4 rows for the next iteration,
    vst1.u8         {d7}, [r4], r5
    vmov            q11, q13                ;interleaved with the stores
    vst1.u8         {d8}, [r4], r5
    vmov            q12, q14
    vst1.u8         {d9}, [r4], r5
    vmov            d26, d30

    bne             filt_blk2d_spo8x8_loop_neon

    pop             {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+ AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_filter8_coeff_
+ DCD filter8_coeff
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+ END
diff --git a/vp8/common/arm/recon_arm.c b/vp8/common/arm/recon_arm.c
new file mode 100644
index 000000000..130059e64
--- /dev/null
+++ b/vp8/common/arm/recon_arm.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "blockd.h"
+
+extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
+
+/*
+void vp8_recon16x16mby(MACROBLOCKD *x)
+{
+ int i;
+ for(i=0;i<16;i+=4)
+ {
+ //vp8_recon4b(&x->block[i]);
+ BLOCKD *b = &x->block[i];
+ vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
+*/
+void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ BLOCKD *b = &x->block[0];
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ //b = &x->block[4];
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ //b = &x->block[8];
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ //b = &x->block[12];
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+#if HAVE_ARMV7
+void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ unsigned char *pred_ptr = &x->predictor[0];
+ short *diff_ptr = &x->diff[0];
+ unsigned char *dst_ptr = x->dst.y_buffer;
+ unsigned char *udst_ptr = x->dst.u_buffer;
+ unsigned char *vdst_ptr = x->dst.v_buffer;
+ int ystride = x->dst.y_stride;
+ //int uv_stride = x->dst.uv_stride;
+
+ vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
+}
+
+#else
+/*
+void vp8_recon16x16mb(MACROBLOCKD *x)
+{
+ int i;
+
+ for(i=0;i<16;i+=4)
+ {
+// vp8_recon4b(&x->block[i]);
+ BLOCKD *b = &x->block[i];
+ vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ }
+ for(i=16;i<24;i+=2)
+ {
+// vp8_recon2b(&x->block[i]);
+ BLOCKD *b = &x->block[i];
+ vp8_recon2b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
+*/
+void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ BLOCKD *b = &x->block[0];
+
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+
+ //b = &x->block[16];
+
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b++;
+ b++;
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b++;
+ b++;
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b++;
+ b++;
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+#endif
diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h
new file mode 100644
index 000000000..fd9f85eea
--- /dev/null
+++ b/vp8/common/arm/recon_arm.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef RECON_ARM_H
+#define RECON_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_recon_block(vp8_recon_b_armv6);
+extern prototype_recon_block(vp8_recon2b_armv6);
+extern prototype_recon_block(vp8_recon4b_armv6);
+
+extern prototype_copy_block(vp8_copy_mem8x8_v6);
+extern prototype_copy_block(vp8_copy_mem8x4_v6);
+extern prototype_copy_block(vp8_copy_mem16x16_v6);
+
+#undef vp8_recon_recon
+#define vp8_recon_recon vp8_recon_b_armv6
+
+#undef vp8_recon_recon2
+#define vp8_recon_recon2 vp8_recon2b_armv6
+
+#undef vp8_recon_recon4
+#define vp8_recon_recon4 vp8_recon4b_armv6
+
+#undef vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6
+
+#undef vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp8_copy_mem8x4_v6
+
+#undef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp8_copy_mem16x16_v6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_recon_block(vp8_recon_b_neon);
+extern prototype_recon_block(vp8_recon2b_neon);
+extern prototype_recon_block(vp8_recon4b_neon);
+
+extern prototype_copy_block(vp8_copy_mem8x8_neon);
+extern prototype_copy_block(vp8_copy_mem8x4_neon);
+extern prototype_copy_block(vp8_copy_mem16x16_neon);
+
+#undef vp8_recon_recon
+#define vp8_recon_recon vp8_recon_b_neon
+
+#undef vp8_recon_recon2
+#define vp8_recon_recon2 vp8_recon2b_neon
+
+#undef vp8_recon_recon4
+#define vp8_recon_recon4 vp8_recon4b_neon
+
+#undef vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon
+
+#undef vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp8_copy_mem8x4_neon
+
+#undef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
+#endif
+
+#endif
diff --git a/vp8/common/arm/reconintra4x4_arm.c b/vp8/common/arm/reconintra4x4_arm.c
new file mode 100644
index 000000000..334d35236
--- /dev/null
+++ b/vp8/common/arm/reconintra4x4_arm.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "vpx_mem/vpx_mem.h"
+#include "reconintra.h"
+
/* Compute the 4x4 intra prediction for one subblock.
 *
 * x         - subblock descriptor; base_dst/dst/dst_stride locate the
 *             already-reconstructed neighbouring pixels in the frame
 * b_mode    - one of the B_* 4x4 intra prediction modes
 * predictor - output; 4x4 samples are written with a fixed stride of 16
 *             (rows of the macroblock-wide predictor buffer)
 *
 * Above points at the 4 reconstructed pixels directly above the block;
 * Above[4..7] are the above-right pixels, which the diagonal modes read
 * (made valid beforehand by vp8_intra_prediction_down_copy).  The exact
 * filter taps and rounding below are normative — see RFC 6386, 4x4
 * intra prediction.
 */
void vp8_predict_intra4x4(BLOCKD *x,
                          int b_mode,
                          unsigned char *predictor)
{
    int i, r, c;

    unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
    unsigned char Left[4];
    unsigned char top_left = Above[-1];   /* corner pixel above-left */

    /* Gather the 4 reconstructed pixels in the column left of the block. */
    Left[0] = (*(x->base_dst))[x->dst - 1];
    Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
    Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
    Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];

    switch (b_mode)
    {
    case B_DC_PRED:
    {
        /* DC: fill with the rounded average of the 8 neighbours. */
        int expected_dc = 0;

        for (i = 0; i < 4; i++)
        {
            expected_dc += Above[i];
            expected_dc += Left[i];
        }

        expected_dc = (expected_dc + 4) >> 3;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                predictor[c] = expected_dc;
            }

            predictor += 16;
        }
    }
    break;
    case B_TM_PRED:
    {
        /* TrueMotion: Above[c] + Left[r] - top_left, clamped to [0,255]. */
        // prediction similar to true_motion prediction
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int pred = Above[c] - top_left + Left[r];

                if (pred < 0)
                    pred = 0;

                if (pred > 255)
                    pred = 255;

                predictor[c] = pred;
            }

            predictor += 16;
        }
    }
    break;

    case B_VE_PRED:
    {
        /* Vertical: each column repeats the [1 2 1]/4 smoothed above row. */
        unsigned int ap[4];
        ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
        ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
        ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
        ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {

                predictor[c] = ap[c];
            }

            predictor += 16;
        }

    }
    break;


    case B_HE_PRED:
    {
        /* Horizontal: each row repeats the smoothed left column.  lp[3]
         * deliberately reuses Left[3] twice (bottom edge replicated) —
         * this matches the VP8 reference/spec. */
        unsigned int lp[4];
        lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
        lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
        lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
        lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                predictor[c] = lp[r];
            }

            predictor += 16;
        }
    }
    break;
    case B_LD_PRED:
    {
        /* Down-left diagonal from the 8 above/above-right pixels; the
         * last tap reuses ptr[7] (right edge replicated, per spec). */
        unsigned char *ptr = Above;
        predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
        predictor[0 * 16 + 1] =
            predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
        predictor[0 * 16 + 2] =
            predictor[1 * 16 + 1] =
                predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
        predictor[0 * 16 + 3] =
            predictor[1 * 16 + 2] =
                predictor[2 * 16 + 1] =
                    predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
        predictor[1 * 16 + 3] =
            predictor[2 * 16 + 2] =
                predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
        predictor[2 * 16 + 3] =
            predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
        predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;

    }
    break;
    case B_RD_PRED:
    {
        /* Down-right diagonal over the combined edge pp[] =
         * {Left reversed, corner, Above}. */
        unsigned char pp[9];

        pp[0] = Left[3];
        pp[1] = Left[2];
        pp[2] = Left[1];
        pp[3] = Left[0];
        pp[4] = top_left;
        pp[5] = Above[0];
        pp[6] = Above[1];
        pp[7] = Above[2];
        pp[8] = Above[3];

        predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[3 * 16 + 1] =
            predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[3 * 16 + 2] =
            predictor[2 * 16 + 1] =
                predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[3 * 16 + 3] =
            predictor[2 * 16 + 2] =
                predictor[1 * 16 + 1] =
                    predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[2 * 16 + 3] =
            predictor[1 * 16 + 2] =
                predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[1 * 16 + 3] =
            predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
        predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;

    }
    break;
    case B_VR_PRED:
    {
        /* Vertical-right: mixes 2-tap ((a+b+1)>>1) and 3-tap filters
         * over the combined edge array. */
        unsigned char pp[9];

        pp[0] = Left[3];
        pp[1] = Left[2];
        pp[2] = Left[1];
        pp[3] = Left[0];
        pp[4] = top_left;
        pp[5] = Above[0];
        pp[6] = Above[1];
        pp[7] = Above[2];
        pp[8] = Above[3];


        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[3 * 16 + 1] =
            predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[2 * 16 + 1] =
            predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
        predictor[3 * 16 + 2] =
            predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[2 * 16 + 2] =
            predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
        predictor[3 * 16 + 3] =
            predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
        predictor[2 * 16 + 3] =
            predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
        predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
        predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;

    }
    break;
    case B_VL_PRED:
    {
        /* Vertical-left: built from the above/above-right row only. */
        unsigned char *pp = Above;

        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
        predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[2 * 16 + 0] =
            predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
        predictor[1 * 16 + 1] =
            predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 1] =
            predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
        predictor[3 * 16 + 1] =
            predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[0 * 16 + 3] =
            predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
        predictor[1 * 16 + 3] =
            predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
    }
    break;

    case B_HD_PRED:
    {
        /* Horizontal-down over the combined edge array. */
        unsigned char pp[9];
        pp[0] = Left[3];
        pp[1] = Left[2];
        pp[2] = Left[1];
        pp[3] = Left[0];
        pp[4] = top_left;
        pp[5] = Above[0];
        pp[6] = Above[1];
        pp[7] = Above[2];
        pp[8] = Above[3];


        predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
        predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[2 * 16 + 0] =
            predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
        predictor[2 * 16 + 1] =
            predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 2] =
            predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
        predictor[2 * 16 + 3] =
            predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[1 * 16 + 2] =
            predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
        predictor[1 * 16 + 3] =
            predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
    }
    break;


    case B_HU_PRED:
    {
        /* Horizontal-up from the left column; (pp[2]+pp[3]*2+pp[3])
         * intentionally reuses pp[3], and the lower-right corner is
         * filled with Left[3] (edge replication, per spec). */
        unsigned char *pp = Left;
        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
        predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[0 * 16 + 2] =
            predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
        predictor[0 * 16 + 3] =
            predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[1 * 16 + 2] =
            predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
        predictor[1 * 16 + 3] =
            predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 2] =
            predictor[2 * 16 + 3] =
                predictor[3 * 16 + 0] =
                    predictor[3 * 16 + 1] =
                        predictor[3 * 16 + 2] =
                            predictor[3 * 16 + 3] = pp[3];
    }
    break;

    /* NOTE(review): no default case — an unknown b_mode leaves the
     * predictor buffer untouched. */
    }
}
+// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
+// to the right prediction have filled in pixels to use.
+void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
+{
+ unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
+
+ unsigned int *src_ptr = (unsigned int *)above_right;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
+
+
+
+/*
+void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ vp8_intra_prediction_down_copy(x);
+
+ for(i=0;i<16;i++)
+ {
+ BLOCKD *b = &x->block[i];
+
+ vp8_predict_intra4x4(b, x->block[i].bmi.mode,x->block[i].predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+
+ vp8_recon_intra_mbuv(x);
+
+}
+*/
+void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+ BLOCKD *b = &x->block[0];
+
+ vp8_intra_prediction_down_copy(x);
+
+ {
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+
+ vp8_recon_intra_mbuv(rtcd, x);
+
+}
diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c
new file mode 100644
index 000000000..d7ee1ddfa
--- /dev/null
+++ b/vp8/common/arm/reconintra_arm.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "blockd.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "recon.h"
+
+#if HAVE_ARMV7
+extern void vp8_build_intra_predictors_mby_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
+{
+ unsigned char *y_buffer = x->dst.y_buffer;
+ unsigned char *ypred_ptr = x->predictor;
+ int y_stride = x->dst.y_stride;
+ int mode = x->mbmi.mode;
+ int Up = x->up_available;
+ int Left = x->left_available;
+
+ vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
+}
+#endif
+
+
+#if HAVE_ARMV7
+extern void vp8_build_intra_predictors_mby_s_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
+{
+ unsigned char *y_buffer = x->dst.y_buffer;
+ unsigned char *ypred_ptr = x->predictor;
+ int y_stride = x->dst.y_stride;
+ int mode = x->mbmi.mode;
+ int Up = x->up_available;
+ int Left = x->left_available;
+
+ vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
+}
+
+#endif
diff --git a/vp8/common/arm/subpixel_arm.h b/vp8/common/arm/subpixel_arm.h
new file mode 100644
index 000000000..56aec55b9
--- /dev/null
+++ b/vp8/common/arm/subpixel_arm.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_ARM_H
+#define SUBPIXEL_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subpixel_predict(vp8_sixtap_predict16x16_armv6);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x8_armv6);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x4_armv6);
+extern prototype_subpixel_predict(vp8_sixtap_predict_armv6);
+extern prototype_subpixel_predict(vp8_bilinear_predict16x16_armv6);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x8_armv6);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6);
+extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6);
+
+#undef vp8_subpix_sixtap16x16
+#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6
+
+#undef vp8_subpix_sixtap8x8
+#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_armv6
+
+#undef vp8_subpix_sixtap8x4
+#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_armv6
+
+#undef vp8_subpix_sixtap4x4
+#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_armv6
+
+#undef vp8_subpix_bilinear16x16
+#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_armv6
+
+#undef vp8_subpix_bilinear8x8
+#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_armv6
+
+#undef vp8_subpix_bilinear8x4
+#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_armv6
+
+#undef vp8_subpix_bilinear4x4
+#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x8_neon);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x4_neon);
+extern prototype_subpixel_predict(vp8_sixtap_predict_neon);
+extern prototype_subpixel_predict(vp8_bilinear_predict16x16_neon);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x8_neon);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon);
+extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon);
+
+#undef vp8_subpix_sixtap16x16
+#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon
+
+#undef vp8_subpix_sixtap8x8
+#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_neon
+
+#undef vp8_subpix_sixtap8x4
+#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_neon
+
+#undef vp8_subpix_sixtap4x4
+#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_neon
+
+#undef vp8_subpix_bilinear16x16
+#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_neon
+
+#undef vp8_subpix_bilinear8x8
+#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_neon
+
+#undef vp8_subpix_bilinear8x4
+#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_neon
+
+#undef vp8_subpix_bilinear4x4
+#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon
+#endif
+
+#endif
diff --git a/vp8/common/arm/systemdependent.c b/vp8/common/arm/systemdependent.c
new file mode 100644
index 000000000..ecc6929c0
--- /dev/null
+++ b/vp8/common/arm/systemdependent.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "g_common.h"
+#include "pragmas.h"
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
+
+void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
+
/* Populate the codec's RTCD (run-time CPU dispatch) vtable and the
 * intra-predictor function pointers with the best implementations
 * available for this build: NEON, ARMv6, or plain C.
 *
 * NOTE(review): despite CONFIG_RUNTIME_CPU_DETECT, the selection here is
 * purely compile-time (#if HAVE_ARMV7/HAVE_ARMV6) — no CPU probing is
 * performed at run time; confirm whether that is intended.
 */
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;

#if HAVE_ARMV7
    /* NEON implementations of subpel prediction, IDCT, loop filter
     * and reconstruction. */
    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_neon;
    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_neon;
    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_neon;
    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_neon;
    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_neon;
    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_neon;
    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_neon;

    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_neon;
    rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;
    rtcd->idct.idct1_scalar = vp8_dc_only_idct_neon;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_neon;
    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;

    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_neon;
    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_neon;
    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_neon;
    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_neon;

    rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
    rtcd->recon.copy8x8   = vp8_copy_mem8x8_neon;
    rtcd->recon.copy8x4   = vp8_copy_mem8x4_neon;
    rtcd->recon.recon     = vp8_recon_b_neon;
    rtcd->recon.recon2    = vp8_recon2b_neon;
    rtcd->recon.recon4    = vp8_recon4b_neon;
#elif HAVE_ARMV6
    /* ARMv6 media-extension implementations. */
    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_armv6;
    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_armv6;
    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_armv6;
    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_armv6;
    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_armv6;
    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_armv6;
    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_armv6;

    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_v6;
    rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
    rtcd->idct.idct1_scalar = vp8_dc_only_idct_armv6;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_armv6;
    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_armv6;

    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;

    rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
    rtcd->recon.copy8x8   = vp8_copy_mem8x8_v6;
    rtcd->recon.copy8x4   = vp8_copy_mem8x4_v6;
    rtcd->recon.recon     = vp8_recon_b_armv6;
    rtcd->recon.recon2    = vp8_recon2b_armv6;
    rtcd->recon.recon4    = vp8_recon4b_armv6;
#else
    /* Pure C fallback. */
    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_c;
    rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
    rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_c;

    rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
    rtcd->recon.copy8x8   = vp8_copy_mem8x8_c;
    rtcd->recon.copy8x4   = vp8_copy_mem8x4_c;
    rtcd->recon.recon     = vp8_recon_b_c;
    rtcd->recon.recon2    = vp8_recon2b_c;
    rtcd->recon.recon4    = vp8_recon4b_c;

    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_c;
    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_c;
    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_c;
    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_c;
    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_c;

    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_c;
    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_c;
    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_c;
    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
#endif

    /* Post-processing has no ARM-optimized versions here: always C. */
    rtcd->postproc.down       = vp8_mbpost_proc_down_c;
    rtcd->postproc.across     = vp8_mbpost_proc_across_ip_c;
    rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
    rtcd->postproc.addnoise   = vp8_plane_add_noise_c;
#endif

    /* Non-RTCD global function pointers for the intra MB Y predictors;
     * only the NEON variants exist, ARMv6 falls back to the C versions. */
#if HAVE_ARMV7
    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby_neon;
    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s_neon;
#elif HAVE_ARMV6
    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
#else
    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;

#endif

}
diff --git a/vp8/common/arm/vpx_asm_offsets.c b/vp8/common/arm/vpx_asm_offsets.c
new file mode 100644
index 000000000..68634bf55
--- /dev/null
+++ b/vp8/common/arm/vpx_asm_offsets.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#if CONFIG_VP8_ENCODER
+#include "vpx_scale/yv12config.h"
+#endif
+
+#if CONFIG_VP8_DECODER
+#include "onyxd_int.h"
+#endif
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
+DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
+#endif
+
+#if CONFIG_VP8_DECODER
+DEFINE(mb_diff, offsetof(MACROBLOCKD, diff));
+DEFINE(mb_predictor, offsetof(MACROBLOCKD, predictor));
+DEFINE(mb_dst_y_stride, offsetof(MACROBLOCKD, dst.y_stride));
+DEFINE(mb_dst_y_buffer, offsetof(MACROBLOCKD, dst.y_buffer));
+DEFINE(mb_dst_u_buffer, offsetof(MACROBLOCKD, dst.u_buffer));
+DEFINE(mb_dst_v_buffer, offsetof(MACROBLOCKD, dst.v_buffer));
+DEFINE(mb_mbmi_mode, offsetof(MACROBLOCKD, mbmi.mode));
+DEFINE(mb_up_available, offsetof(MACROBLOCKD, up_available));
+DEFINE(mb_left_available, offsetof(MACROBLOCKD, left_available));
+
+DEFINE(detok_scan, offsetof(DETOK, scan));
+DEFINE(detok_ptr_onyxblock2context_leftabove, offsetof(DETOK, ptr_onyxblock2context_leftabove));
+DEFINE(detok_onyx_coef_tree_ptr, offsetof(DETOK, vp8_coef_tree_ptr));
+DEFINE(detok_teb_base_ptr, offsetof(DETOK, teb_base_ptr));
+DEFINE(detok_norm_ptr, offsetof(DETOK, norm_ptr));
+DEFINE(detok_ptr_onyx_coef_bands_x, offsetof(DETOK, ptr_onyx_coef_bands_x));
+
+DEFINE(DETOK_A, offsetof(DETOK, A));
+DEFINE(DETOK_L, offsetof(DETOK, L));
+
+DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr));
+DEFINE(detok_current_bc, offsetof(DETOK, current_bc));
+DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs));
+DEFINE(detok_eob, offsetof(DETOK, eob));
+
+DEFINE(bool_decoder_lowvalue, offsetof(BOOL_DECODER, lowvalue));
+DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
+DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_user_buffer_sz, offsetof(BOOL_DECODER, user_buffer_sz));
+DEFINE(bool_decoder_decode_buffer, offsetof(BOOL_DECODER, decode_buffer));
+DEFINE(bool_decoder_read_ptr, offsetof(BOOL_DECODER, read_ptr));
+DEFINE(bool_decoder_write_ptr, offsetof(BOOL_DECODER, write_ptr));
+
+DEFINE(tokenextrabits_min_val, offsetof(TOKENEXTRABITS, min_val));
+DEFINE(tokenextrabits_length, offsetof(TOKENEXTRABITS, Length));
+#endif
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */
diff --git a/vp8/common/bigend.h b/vp8/common/bigend.h
new file mode 100644
index 000000000..6a91ba1ae
--- /dev/null
+++ b/vp8/common/bigend.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _bigend_h
+#define _bigend_h
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) )
+#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) )
+
+#define high_byte(x) (unsigned char)x
+#define mid2Byte(x) (unsigned char)(x >> 8)
+#define mid1Byte(x) (unsigned char)(x >> 16)
+#define low_byte(x) (unsigned char)(x >> 24)
+
+#define SWAPENDS 1
+
+#if defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/vp8/common/blockd.c b/vp8/common/blockd.c
new file mode 100644
index 000000000..53f5e72d2
--- /dev/null
+++ b/vp8/common/blockd.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_temp_context(TEMP_CONTEXT *t, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int count)
+{
+ vpx_memcpy(t->l, l, sizeof(ENTROPY_CONTEXT) * count);
+ vpx_memcpy(t->a, a, sizeof(ENTROPY_CONTEXT) * count);
+}
+
+const int vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0};
+const int vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+const int vp8_block2type[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1};
+const int vp8_block2context[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3};
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
new file mode 100644
index 000000000..84ed53ad2
--- /dev/null
+++ b/vp8/common/blockd.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCKD_H
+#define __INC_BLOCKD_H
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "subpixel.h"
+#include "vpx_ports/mem.h"
+
+#define TRUE 1
+#define FALSE 0
+
+//#define DCPRED 1
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define Y1CONTEXT 0
+#define UCONTEXT 1
+#define VCONTEXT 2
+#define Y2CONTEXT 3
+
+#define MB_FEATURE_TREE_PROBS 3
+#define MAX_MB_SEGMENTS 4
+
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 4
+
+// Segment Feature Masks
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+typedef struct
+{
+ int r, c;
+} POS;
+
+
+typedef int ENTROPY_CONTEXT;
+
+typedef struct
+{
+ ENTROPY_CONTEXT l[4];
+ ENTROPY_CONTEXT a[4];
+} TEMP_CONTEXT;
+
+extern void vp8_setup_temp_context(TEMP_CONTEXT *t, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int count);
+extern const int vp8_block2left[25];
+extern const int vp8_block2above[25];
+extern const int vp8_block2type[25];
+extern const int vp8_block2context[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+ Dest = ((A)!=0) + ((B)!=0);
+
+
+typedef enum
+{
+ KEY_FRAME = 0,
+ INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+ DC_PRED, // average of above and left pixels
+ V_PRED, // vertical prediction
+ H_PRED, // horizontal prediction
+ TM_PRED, // Truemotion prediction
+ B_PRED, // block based prediction, each block has its own prediction mode
+
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ SPLITMV,
+
+ MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+// Macroblock level features
+typedef enum
+{
+ MB_LVL_ALT_Q = 0, // Use alternate Quantizer ....
+ MB_LVL_ALT_LF = 1, // Use alternate loop filter value...
+ MB_LVL_MAX = 2, // Number of MB level features supported
+
+} MB_LVL_FEATURES;
+
+// Segment Feature Masks
+#define SEGMENT_ALTQ 0x01
+#define SEGMENT_ALT_LF 0x02
+
+#define VP8_YMODES (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum
+{
+ B_DC_PRED, // average of above and left pixels
+ B_TM_PRED,
+
+ B_VE_PRED, // vertical prediction
+ B_HE_PRED, // horizontal prediction
+
+ B_LD_PRED,
+ B_RD_PRED,
+
+ B_VR_PRED,
+ B_VL_PRED,
+ B_HD_PRED,
+ B_HU_PRED,
+
+ LEFT4X4,
+ ABOVE4X4,
+ ZERO4X4,
+ NEW4X4,
+
+ B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+typedef struct
+{
+ B_PREDICTION_MODE mode;
+ union
+ {
+ int as_int;
+ MV as_mv;
+ } mv;
+} B_MODE_INFO;
+
+
+typedef enum
+{
+ INTRA_FRAME = 0,
+ LAST_FRAME = 1,
+ GOLDEN_FRAME = 2,
+ ALTREF_FRAME = 3,
+ MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+ MB_PREDICTION_MODE mode, uv_mode;
+ MV_REFERENCE_FRAME ref_frame;
+ union
+ {
+ int as_int;
+ MV as_mv;
+ } mv;
+ int partitioning;
+ int partition_count;
+ int mb_skip_coeff; //does this mb have coefficients at all; 1=no coefficients, 0=need to decode tokens
+ int dc_diff;
+ unsigned char segment_id; // Which set of segmentation parameters should be used for this MB
+ int force_no_skip;
+
+ B_MODE_INFO partition_bmi[16];
+
+} MB_MODE_INFO;
+
+
+typedef struct
+{
+ MB_MODE_INFO mbmi;
+ B_MODE_INFO bmi[16];
+} MODE_INFO;
+
+
+typedef struct
+{
+ short *qcoeff;
+ short *dqcoeff;
+ unsigned char *predictor;
+ short *diff;
+ short *reference;
+
+ short(*dequant)[4];
+
+ // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+ unsigned char **base_pre;
+ int pre;
+ int pre_stride;
+
+ unsigned char **base_dst;
+ int dst;
+ int dst_stride;
+
+ int eob;
+
+ B_MODE_INFO bmi;
+
+} BLOCKD;
+
+typedef struct
+{
+ DECLARE_ALIGNED(16, short, diff[400]); // from idct diff
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED(16, short, reference[384]);
+ DECLARE_ALIGNED(16, short, qcoeff[400]);
+ DECLARE_ALIGNED(16, short, dqcoeff[400]);
+
+ // 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
+ BLOCKD block[25];
+
+ YV12_BUFFER_CONFIG pre; // Filtered copy of previous frame reconstruction
+ YV12_BUFFER_CONFIG dst;
+
+ MODE_INFO *mode_info_context;
+ MODE_INFO *mode_info;
+
+ int mode_info_stride;
+
+ FRAME_TYPE frame_type;
+
+ MB_MODE_INFO mbmi;
+
+ int up_available;
+ int left_available;
+
+ // Y,U,V,Y2
+ ENTROPY_CONTEXT *above_context[4]; // row of context for each plane
+ ENTROPY_CONTEXT(*left_context)[4]; // (up to) 4 contexts ""
+
+ // 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active.
+ unsigned char segmentation_enabled;
+
+ // 0 (do not update) 1 (update) the macroblock segmentation map.
+ unsigned char update_mb_segmentation_map;
+
+ // 0 (do not update) 1 (update) the macroblock segmentation feature data.
+ unsigned char update_mb_segmentation_data;
+
+ // 0 (SEGMENT_DELTADATA) 1 (SEGMENT_ABSDATA): how the segment feature data is coded.
+ unsigned char mb_segement_abs_delta;
+
+ // Per frame flags that define which MB level features (such as quantizer or loop filter level)
+ // are enabled and, when enabled, the probabilities used to decode the per MB flags in MB_MODE_INFO
+ vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; // Probability Tree used to code Segment number
+
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment parameters
+
+ // mode_based Loop filter adjustment
+ unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;
+
+ // Delta values have the range +/- MAX_LOOP_FILTER
+ //char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF
+ //char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF
+ signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT
+
+ // Distance of MB away from frame edges
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ //char * gf_active_ptr;
+ signed char *gf_active_ptr;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+ vp8_subpix_fn_t subpixel_predict;
+ vp8_subpix_fn_t subpixel_predict8x4;
+ vp8_subpix_fn_t subpixel_predict8x8;
+ vp8_subpix_fn_t subpixel_predict16x16;
+
+ void *current_bc;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ struct VP8_COMMON_RTCD *rtcd;
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#endif /* __INC_BLOCKD_H */
diff --git a/vp8/common/boolcoder.h b/vp8/common/boolcoder.h
new file mode 100644
index 000000000..0659d4873
--- /dev/null
+++ b/vp8/common/boolcoder.h
@@ -0,0 +1,569 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef bool_coder_h
+#define bool_coder_h 1
+
+/* Arithmetic bool coder with largish probability range.
+ Timothy S Murphy 6 August 2004 */
+
+/* So as not to force users to drag in too much of my idiosyncratic C++ world,
+ I avoid fancy storage management. */
+
+#include <assert.h>
+
+#include <stddef.h>
+#include <stdio.h>
+
+typedef unsigned char vp8bc_index_t; // probability index
+
+/* There are a couple of slight variants in the details of finite-precision
+ arithmetic coding. May be safely ignored by most users. */
+
+enum vp8bc_rounding
+{
+ vp8bc_down = 0, // just like VP8
+ vp8bc_down_full = 1, // handles minimum probability correctly
+ vp8bc_up = 2
+};
+
+#if _MSC_VER
+
+/* Note that msvc by default does not inline _anything_ (regardless of the
+ setting of inline_depth) and that a command-line option (-Ob1 or -Ob2)
+ is required to inline even the smallest functions. */
+
+# pragma inline_depth( 255) // I mean it when I inline something
+# pragma warning( disable : 4099) // No class vs. struct harassment
+# pragma warning( disable : 4250) // dominance complaints
+# pragma warning( disable : 4284) // operator-> in templates
+# pragma warning( disable : 4800) // bool conversion
+
+// don't let prefix ++,-- stand in for postfix, disaster would ensue
+
+# pragma warning( error : 4620 4621)
+
+#endif // _MSC_VER
+
+
+#if __cplusplus
+
+// Sometimes one wishes to be definite about integer lengths.
+
+struct int_types
+{
+ typedef const bool cbool;
+ typedef const signed char cchar;
+ typedef const short cshort;
+ typedef const int cint;
+ typedef const int clong;
+
+ typedef const double cdouble;
+ typedef const size_t csize_t;
+
+ typedef unsigned char uchar; // 8 bits
+ typedef const uchar cuchar;
+
+ typedef short int16;
+ typedef unsigned short uint16;
+ typedef const int16 cint16;
+ typedef const uint16 cuint16;
+
+ typedef int int32;
+ typedef unsigned int uint32;
+ typedef const int32 cint32;
+ typedef const uint32 cuint32;
+
+ typedef unsigned int uint;
+ typedef unsigned int ulong;
+ typedef const uint cuint;
+ typedef const ulong culong;
+
+
+ // All structs consume space, may as well have a vptr.
+
+ virtual ~int_types();
+};
+
+
+struct bool_coder_spec;
+struct bool_coder;
+struct bool_writer;
+struct bool_reader;
+
+
+struct bool_coder_namespace : int_types
+{
+ typedef vp8bc_index_t Index;
+ typedef bool_coder_spec Spec;
+ typedef const Spec c_spec;
+
+ enum Rounding
+ {
+ Down = vp8bc_down,
+ down_full = vp8bc_down_full,
+ Up = vp8bc_up
+ };
+};
+
+
+// Archivable specification of a bool coder includes rounding spec
+// and probability mapping table. The latter replaces a uchar j
+// (0 <= j < 256) with an arbitrary uint16 tbl[j] = p.
+// p/65536 is then the probability of a zero.
+
+struct bool_coder_spec : bool_coder_namespace
+{
+ friend struct bool_coder;
+ friend struct bool_writer;
+ friend struct bool_reader;
+ friend struct bool_coder_spec_float;
+ friend struct bool_coder_spec_explicit_table;
+ friend struct bool_coder_spec_exponential_table;
+ friend struct BPsrc;
+private:
+ uint w; // precision
+ Rounding r;
+
+ uint ebits, mbits, ebias;
+ uint32 mmask;
+
+ Index max_index, half_index;
+
+ uint32 mantissa(Index i) const
+ {
+ assert(i < half_index);
+ return (1 << mbits) + (i & mmask);
+ }
+ uint exponent(Index i) const
+ {
+ assert(i < half_index);
+ return ebias - (i >> mbits);
+ }
+
+ uint16 Ptbl[256]; // kinda clunky, but so is storage management.
+
+ /* Cost in bits of encoding a zero at every probability, scaled by 2^20.
+ Assumes that index is at most 8 bits wide. */
+
+ uint32 Ctbl[256];
+
+ uint32 split(Index i, uint32 R) const // 1 <= split <= max( 1, R-1)
+ {
+ if (!ebias)
+ return 1 + (((R - 1) * Ptbl[i]) >> 16);
+
+ if (i >= half_index)
+ return R - split(max_index - i, R);
+
+ return 1 + (((R - 1) * mantissa(i)) >> exponent(i));
+ }
+
+ uint32 max_range() const
+ {
+ return (1 << w) - (r == down_full ? 0 : 1);
+ }
+ uint32 min_range() const
+ {
+ return (1 << (w - 1)) + (r == down_full ? 1 : 0);
+ }
+ uint32 Rinc() const
+ {
+ return r == Up ? 1 : 0;
+ }
+
+ void check_prec() const;
+
+ bool float_init(uint Ebits, uint Mbits);
+
+ void cost_init();
+
+ bool_coder_spec(
+ uint prec, Rounding rr, uint Ebits = 0, uint Mbits = 0
+ )
+ : w(prec), r(rr)
+ {
+ float_init(Ebits, Mbits);
+ }
+public:
+ // Read complete spec from file.
+ bool_coder_spec(FILE *);
+
+ // Write spec to file.
+ void dump(FILE *) const;
+
+ // return probability index best approximating prob.
+ Index operator()(double prob) const;
+
+ // probability corresponding to index
+ double operator()(Index i) const;
+
+ Index complement(Index i) const
+ {
+ return max_index - i;
+ }
+
+ Index max_index() const
+ {
+ return max_index;
+ }
+ Index half_index() const
+ {
+ return half_index;
+ }
+
+ uint32 cost_zero(Index i) const
+ {
+ return Ctbl[i];
+ }
+ uint32 cost_one(Index i) const
+ {
+ return Ctbl[ max_index - i];
+ }
+ uint32 cost_bit(Index i, bool b) const
+ {
+ return Ctbl[b? max_index-i:i];
+ }
+};
+
+
+/* Pseudo floating-point probability specification.
+
+ At least one of Ebits and Mbits must be nonzero.
+
+ Since all arithmetic is done at 32 bits, Ebits is at most 5.
+
+ Total significant bits in index is Ebits + Mbits + 1.
+
+ Below the halfway point (i.e. when the top significant bit is 0),
+ the index is (e << Mbits) + m.
+
+ The exponent e is between 0 and (2**Ebits) - 1,
+ the mantissa m is between 0 and (2**Mbits) - 1.
+
+ Prepending an implicit 1 to the mantissa, the probability is then
+
+ (2**Mbits + m) >> (e - 2**Ebits - 1 - Mbits),
+
+ which has (1/2)**(2**Ebits + 1) as a minimum
+ and (1/2) * [1 - 2**-(Mbits + 1)] as a maximum.
+
+ When the index is above the halfway point, the probability is the
+ complement of the probability associated to the complement of the index.
+
+ Note that the probability increases with the index and that, because of
+ the symmetry, we cannot encode probability exactly 1/2; though we
+ can get as close to 1/2 as we like, provided we have enough Mbits.
+
+ The latter is of course not a problem in practice, one never has
+ exact probabilities and entropy errors are second order, that is, the
+ "overcoding" of a zero will be largely compensated for by the
+ "undercoding" of a one (or vice-versa).
+
+ Compared to arithmetic probability specs (a la VP8), this will do better
+ at very high and low probabilities and worse at probabilities near 1/2,
+ as well as facilitating the usage of wider or narrower probability indices.
+*/
+
+struct bool_coder_spec_float : bool_coder_spec
+{
+ bool_coder_spec_float(
+ uint Ebits = 3, uint Mbits = 4, Rounding rr = down_full, uint prec = 12
+ )
+ : bool_coder_spec(prec, rr, Ebits, Mbits)
+ {
+ cost_init();
+ }
+};
+
+
+struct bool_coder_spec_explicit_table : bool_coder_spec
+{
+ bool_coder_spec_explicit_table(
+ cuint16 probability_table[256] = 0, // default is tbl[i] = i << 8.
+ Rounding = down_full,
+ uint precision = 16
+ );
+};
+
+// Construct table via multiplicative interpolation between
+// p[128] = 1/2 and p[0] = (1/2)^x.
+// Since we are working with 16-bit precision, x is at most 16.
+// For probabilities to increase with i, we must have x > 1.
+// For 0 <= i <= 128, p[i] = (1/2)^{ 1 + [1 - (i/128)]*[x-1] }.
+// Finally, p[128+i] = 1 - p[128 - i].
+
+struct bool_coder_spec_exponential_table : bool_coder_spec
+{
+ bool_coder_spec_exponential_table(uint x, Rounding = down_full, uint prec = 16);
+};
+
+
+// Commonalities between writer and reader.
+
+struct bool_coder : bool_coder_namespace
+{
+ friend struct bool_writer;
+ friend struct bool_reader;
+ friend struct BPsrc;
+private:
+ uint32 Low, Range;
+ cuint32 min_range;
+ cuint32 rinc;
+ c_spec spec;
+
+ void _reset()
+ {
+ Low = 0;
+ Range = spec.max_range();
+ }
+
+ bool_coder(c_spec &s)
+ : min_range(s.min_range()),
+ rinc(s.Rinc()),
+ spec(s)
+ {
+ _reset();
+ }
+
+ uint32 half() const
+ {
+ return 1 + ((Range - 1) >> 1);
+ }
+public:
+ c_spec &Spec() const
+ {
+ return spec;
+ }
+};
+
+
+struct bool_writer : bool_coder
+{
+ friend struct BPsrc;
+private:
+ uchar *Bstart, *Bend, *B;
+ int bit_lag;
+ bool is_toast;
+ void carry();
+ void reset()
+ {
+ _reset();
+ bit_lag = 32 - spec.w;
+ is_toast = 0;
+ }
+ void raw(bool value, uint32 split);
+public:
+ bool_writer(c_spec &, uchar *Dest, size_t Len);
+ virtual ~bool_writer();
+
+ void operator()(Index p, bool v)
+ {
+ raw(v, spec.split(p, Range));
+ }
+
+ uchar *buf() const
+ {
+ return Bstart;
+ }
+ size_t bytes_written() const
+ {
+ return B - Bstart;
+ }
+
+ // Call when done with input, flushes internal state.
+ // DO NOT write any more data after calling this.
+
+ bool_writer &flush();
+
+ void write_bits(int n, uint val)
+ {
+ if (n)
+ {
+ uint m = 1 << (n - 1);
+
+ do
+ {
+ raw((bool)(val & m), half());
+ }
+ while (m >>= 1);
+ }
+ }
+
+# if 0
+ // We are agnostic about storage management.
+ // By default, overflows throw an assert but user can
+ // override to provide an expanding buffer using ...
+
+ virtual void overflow(uint Len) const;
+
+ // ... this function copies already-written data into new buffer
+ // and retains new buffer location.
+
+ void new_buffer(uchar *dest, uint Len);
+
+ // Note that storage management is the user's responsibility.
+# endif
+};
+
+
+// This could be adjusted to use a little less lookahead.
+
+struct bool_reader : bool_coder
+{
+ friend struct BPsrc;
+private:
+ cuchar *const Bstart; // for debugging
+ cuchar *B;
+ cuchar *const Bend;
+ cuint shf;
+ uint bct;
+ bool raw(uint32 split);
+public:
+ bool_reader(c_spec &s, cuchar *src, size_t Len);
+
+ bool operator()(Index p)
+ {
+ return raw(spec.split(p, Range));
+ }
+
+ uint read_bits(int num_bits)
+ {
+ uint v = 0;
+
+ while (--num_bits >= 0)
+ v += v + (raw(half()) ? 1 : 0);
+
+ return v;
+ }
+};
+
+extern "C" {
+
+#endif /* __cplusplus */
+
+
+ /* C interface */
+
+ typedef struct bool_coder_spec bool_coder_spec;
+ typedef struct bool_writer bool_writer;
+ typedef struct bool_reader bool_reader;
+
+ typedef const bool_coder_spec c_bool_coder_spec;
+ typedef const bool_writer c_bool_writer;
+ typedef const bool_reader c_bool_reader;
+
+
+ /* Optionally override default precision when constructing coder_specs.
+ Just pass a zero pointer if you don't care.
+ Precision is at most 16 bits for table specs, at most 23 otherwise. */
+
+ struct vp8bc_prec
+ {
+ enum vp8bc_rounding r; /* see top header file for def */
+ unsigned int prec; /* range precision in bits */
+ };
+
+ typedef const struct vp8bc_prec vp8bc_c_prec;
+
+ /* bool_coder_spec contains mapping of uchars to actual probabilities
+ (16 bit uints) as well as (usually immaterial) selection of
+ exact finite-precision algorithm used (for now, the latter can only
+ be overridden using the C++ interface).
+ See comments above the corresponding C++ constructors for discussion,
+ especially of exponential probability table generation. */
+
+ bool_coder_spec *vp8bc_vp8spec(); // just like vp8
+
+ bool_coder_spec *vp8bc_literal_spec(
+ const unsigned short prob_map[256], // 0 is like vp8 w/more precision
+ vp8bc_c_prec*
+ );
+
+ bool_coder_spec *vp8bc_float_spec(
+ unsigned int exponent_bits, unsigned int mantissa_bits, vp8bc_c_prec*
+ );
+
+ bool_coder_spec *vp8bc_exponential_spec(unsigned int min_exp, vp8bc_c_prec *);
+
+ bool_coder_spec *vp8bc_spec_from_file(FILE *);
+
+
+ void vp8bc_destroy_spec(c_bool_coder_spec *);
+
+ void vp8bc_spec_to_file(c_bool_coder_spec *, FILE *);
+
+
+ /* Nearest index to supplied probability of zero, 0 <= prob <= 1. */
+
+ vp8bc_index_t vp8bc_index(c_bool_coder_spec *, double prob);
+
+ vp8bc_index_t vp8bc_index_from_counts(
+ c_bool_coder_spec *p, unsigned int zero_ct, unsigned int one_ct
+ );
+
+ /* In case you want to look */
+
+ double vp8bc_probability(c_bool_coder_spec *, vp8bc_index_t);
+
+ /* Opposite index */
+
+ vp8bc_index_t vp8bc_complement(c_bool_coder_spec *, vp8bc_index_t);
+
+ /* Cost in bits of encoding a zero at given probability, scaled by 2^20.
+ (assumes that an int holds at least 32 bits). */
+
+ unsigned int vp8bc_cost_zero(c_bool_coder_spec *, vp8bc_index_t);
+
+ unsigned int vp8bc_cost_one(c_bool_coder_spec *, vp8bc_index_t);
+ unsigned int vp8bc_cost_bit(c_bool_coder_spec *, vp8bc_index_t, int);
+
+
+ /* bool_writer interface */
+
+ /* Length = 0 disables checking for writes beyond buffer end. */
+
+ bool_writer *vp8bc_create_writer(
+ c_bool_coder_spec *, unsigned char *Destination, size_t Length
+ );
+
+ /* Flushes out any buffered data and returns total # of bytes written. */
+
+ size_t vp8bc_destroy_writer(bool_writer *);
+
+ void vp8bc_write_bool(bool_writer *, int boolean_val, vp8bc_index_t false_prob);
+
+ void vp8bc_write_bits(
+ bool_writer *, unsigned int integer_value, int number_of_bits
+ );
+
+ c_bool_coder_spec *vp8bc_writer_spec(c_bool_writer *);
+
+
+ /* bool_reader interface */
+
+ /* Length = 0 disables checking for reads beyond buffer end. */
+
+ bool_reader *vp8bc_create_reader(
+ c_bool_coder_spec *, const unsigned char *Source, size_t Length
+ );
+ void vp8bc_destroy_reader(bool_reader *);
+
+ int vp8bc_read_bool(bool_reader *, vp8bc_index_t false_prob);
+
+ unsigned int vp8bc_read_bits(bool_reader *, int number_of_bits);
+
+ c_bool_coder_spec *vp8bc_reader_spec(c_bool_reader *);
+
+#if __cplusplus
+}
+#endif
+
+#endif /* bool_coder_h */
diff --git a/vp8/common/codec_common_interface.h b/vp8/common/codec_common_interface.h
new file mode 100644
index 000000000..7881b0a41
--- /dev/null
+++ b/vp8/common/codec_common_interface.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+#ifndef CODEC_COMMON_INTERFACE_H
+#define CODEC_COMMON_INTERFACE_H
+
+#define __export
+#define _export
+#define dll_export __declspec( dllexport )
+#define dll_import __declspec( dllimport )
+
+// Playback ERROR Codes.
+#define NO_DECODER_ERROR 0
+#define REMOTE_DECODER_ERROR -1
+
+#define DFR_BAD_DCT_COEFF -100
+#define DFR_ZERO_LENGTH_FRAME -101
+#define DFR_FRAME_SIZE_INVALID -102
+#define DFR_OUTPUT_BUFFER_OVERFLOW -103
+#define DFR_INVALID_FRAME_HEADER -104
+#define FR_INVALID_MODE_TOKEN -110
+#define ETR_ALLOCATION_ERROR -200
+#define ETR_INVALID_ROOT_PTR -201
+#define SYNCH_ERROR -400
+#define BUFFER_UNDERFLOW_ERROR -500
+#define PB_IB_OVERFLOW_ERROR -501
+
+// External error triggers
+#define PB_HEADER_CHECKSUM_ERROR -601
+#define PB_DATA_CHECKSUM_ERROR -602
+
+// DCT Error Codes
+#define DDCT_EXPANSION_ERROR -700
+#define DDCT_INVALID_TOKEN_ERROR -701
+
+// exception_errors
+#define GEN_EXCEPTIONS -800
+#define EX_UNQUAL_ERROR -801
+
+// Unrecoverable error codes
+#define FATAL_PLAYBACK_ERROR -1000
+#define GEN_ERROR_CREATING_CDC -1001
+#define GEN_THREAD_CREATION_ERROR -1002
+#define DFR_CREATE_BMP_FAILED -1003
+
+// YUV buffer configuration structure
+typedef struct
+{
+ int y_width;
+ int y_height;
+ int y_stride;
+
+ int uv_width;
+ int uv_height;
+ int uv_stride;
+
+ unsigned char *y_buffer;
+ unsigned char *u_buffer;
+ unsigned char *v_buffer;
+
+} YUV_BUFFER_CONFIG;
+typedef enum
+{
+ C_SET_KEY_FRAME,
+ C_SET_FIXED_Q,
+ C_SET_FIRSTPASS_FILE,
+ C_SET_EXPERIMENTAL_MIN,
+ C_SET_EXPERIMENTAL_MAX = C_SET_EXPERIMENTAL_MIN + 255,
+ C_SET_CHECKPROTECT,
+ C_SET_TESTMODE,
+ C_SET_INTERNAL_SIZE,
+ C_SET_RECOVERY_FRAME,
+ C_SET_REFERENCEFRAME,
+ C_SET_GOLDENFRAME
+
+#ifndef VP50_COMP_INTERFACE
+ // Specialist test facilities.
+// C_VCAP_PARAMS, // DO NOT USE FOR NOW WITH VFW CODEC
+#endif
+
+} C_SETTING;
+
+typedef unsigned long C_SET_VALUE;
+
+
+#endif
diff --git a/vp8/common/coefupdateprobs.h b/vp8/common/coefupdateprobs.h
new file mode 100644
index 000000000..99affd618
--- /dev/null
+++ b/vp8/common/coefupdateprobs.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
diff --git a/vp8/common/common.h b/vp8/common/common.h
new file mode 100644
index 000000000..29f6d371b
--- /dev/null
+++ b/vp8/common/common.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "common_types.h"
+
+/* Only need this for fixed-size arrays, for structs just assign. */
+
+#define vp8_copy( Dest, Src) { \
+ assert( sizeof( Dest) == sizeof( Src)); \
+ vpx_memcpy( Dest, Src, sizeof( Src)); \
+ }
+
+/* Use this for variably-sized arrays. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+ assert( sizeof( *Dest) == sizeof( *Src)); \
+ vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+ }
+
+#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+
+#endif /* common_h */
diff --git a/vp8/common/common_types.h b/vp8/common/common_types.h
new file mode 100644
index 000000000..deb5ed8e5
--- /dev/null
+++ b/vp8/common/common_types.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_COMMON_TYPES
+#define __INC_COMMON_TYPES
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
diff --git a/vp8/common/context.c b/vp8/common/context.c
new file mode 100644
index 000000000..17ee8c338
--- /dev/null
+++ b/vp8/common/context.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] =
+{
+ {
+ // Block Type ( 0 )
+ {
+ // Coeff Band ( 0 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ // Block Type ( 1 )
+ {
+ // Coeff Band ( 0 )
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ // Block Type ( 2 )
+ {
+ // Coeff Band ( 0 )
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ // Coeff Band ( 2 )
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ // Coeff Band ( 3 )
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ // Coeff Band ( 4 )
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ // Coeff Band ( 5 )
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ // Coeff Band ( 6 )
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ // Block Type ( 3 )
+ {
+ // Coeff Band ( 0 )
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
+
+//Update probabilities for the nodes in the token entropy tree.
+const vp8_prob tree_update_probs[vp8_coef_tree_dimen] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
+#endif
diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c
new file mode 100644
index 000000000..e2d2d2c0f
--- /dev/null
+++ b/vp8/common/debugmodes.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+
+ int mb_row;
+ int mb_col;
+ int mb_index = 0;
+ FILE *mvs = fopen("mvs.stt", "a");
+
+ // print out the macroblock Y modes
+ mb_index = 0;
+ fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ mb_index = 0;
+ fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ // print out the macroblock UV modes
+ mb_index = 0;
+ fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+ // print out the block modes
+ mb_index = 0;
+ fprintf(mvs, "Mbs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+
+ if (mi[mb_index].mbmi.mode == B_PRED)
+ fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].mode);
+ else
+ fprintf(mvs, "xx ");
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+ // print out the macroblock mvs
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+ fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+
+ // print out the block modes
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+ fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+
+ fclose(mvs);
+}
diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h
new file mode 100644
index 000000000..ccdf326e6
--- /dev/null
+++ b/vp8/common/defaultcoefcounts.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/* Generated file, included by entropy.c */
+
+static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] =
+{
+
+ {
+ // Block Type ( 0 )
+ {
+ // Coeff Band ( 0 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ // Block Type ( 1 )
+ {
+ // Coeff Band ( 0 )
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ // Block Type ( 2 )
+ {
+ // Coeff Band ( 0 )
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ // Coeff Band ( 2 )
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ // Coeff Band ( 3 )
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ // Coeff Band ( 4 )
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ // Coeff Band ( 5 )
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ // Coeff Band ( 6 )
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ // Block Type ( 3 )
+ {
+ // Coeff Band ( 0 )
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
diff --git a/vp8/common/dma_desc.h b/vp8/common/dma_desc.h
new file mode 100644
index 000000000..5e6fa0ca9
--- /dev/null
+++ b/vp8/common/dma_desc.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _dma_desc_h
+#define _dma_desc_h
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
/* "Next descriptor size" values for the DMA configuration word.
   NOTE(review): field names and FLOW_* values look like Analog Devices
   Blackfin descriptor-based DMA -- confirm the intended target. */
#define NDSIZE_LG   0x00000900    // Next Descriptor Size (large descriptor list)
#define NDSIZE_SM   0x00000800    // Next Descriptor Size (small descriptor list)
#define NDSIZE_7    0x00000700    // Next Descriptor Size
#define NDSIZE_6    0x00000600    // Next Descriptor Size
#define NDSIZE_5    0x00000500    // Next Descriptor Size
#define NDSIZE_4    0x00000400    // Next Descriptor Size
#define NDSIZE_3    0x00000300    // Next Descriptor Size
#define NDSIZE_2    0x00000200    // Next Descriptor Size
#define NDSIZE_1    0x00000100    // Next Descriptor Size

/* DMA flow-control modes. */
#define FLOW_STOP       0x0000    // stop after this work unit
#define FLOW_AUTO       0x1000    // autobuffer (restart automatically)
#define FLOW_DESC_AR    0x4000    // descriptor array mode
#define FLOW_DESC_SM    0x6000    // small descriptor list mode
#define FLOW_DESC_LG    0x7000    // large descriptor list mode

/* Large-model descriptor: 32-bit next/source pointers plus 2-D transfer
   parameters (count/modify for the inner X and outer Y loops). */
typedef struct
{
    unsigned int ndp;          // next descriptor pointer
    //unsigned short ndpl;
    //unsigned short ndph;
    unsigned int sa;           // start (source) address
    //unsigned short sal;
    //unsigned short sah;

    unsigned short dmacfg;     // DMA configuration word
    unsigned short xcnt;       // inner-loop element count
    unsigned short xmod;       // inner-loop address increment
    unsigned short ycnt;       // outer-loop line count
    unsigned short ymod;       // outer-loop address increment

} LARGE_DESC;

/* Small-model descriptor: 16-bit next pointer; source address split into
   low/high halfwords. */
typedef struct
{
    unsigned short ndpl;       // next descriptor pointer (low half)
    unsigned short sal;        // source address, low half
    unsigned short sah;        // source address, high half
    unsigned short dmacfg;
    unsigned short xcnt;
    unsigned short xmod;
    unsigned short ycnt;
    unsigned short ymod;
} ARRAY_DESC_7_placeholder_see_below;

typedef struct
{
    unsigned short ndpl;
    unsigned short sal;
    unsigned short sah;
    unsigned short dmacfg;
    unsigned short xcnt;
    unsigned short xmod;
    unsigned short ycnt;
    unsigned short ymod;
} SMALL_DESC;

/* Descriptor-array elements: no next pointer; ARRAY_DESC_n carries the
   first n halfwords of the register set (sal, sah, dmacfg, xcnt, xmod,
   ycnt, ymod). */
typedef struct
{
    unsigned short sal;
    unsigned short sah;
    unsigned short dmacfg;
    unsigned short xcnt;
    unsigned short xmod;
    unsigned short ycnt;
    unsigned short ymod;
} ARRAY_DESC_7;

typedef struct
{
    unsigned short sal;
    unsigned short sah;
    unsigned short dmacfg;
    unsigned short xcnt;
    unsigned short xmod;
    unsigned short ycnt;
} ARRAY_DESC_6;

typedef struct
{
    unsigned short sal;
    unsigned short sah;
    unsigned short dmacfg;
    unsigned short xcnt;
    unsigned short xmod;
} ARRAY_DESC_5;

typedef struct
{
    unsigned short sal;
    unsigned short sah;
    unsigned short dmacfg;
    unsigned short xcnt;
} ARRAY_DESC_4;

typedef struct
{
    unsigned short sal;
    unsigned short sah;
    unsigned short dmacfg;
} ARRAY_DESC_3;

typedef struct
{
    unsigned short sal;
    unsigned short sah;
} ARRAY_DESC_2;

typedef struct
{
    unsigned short sal;
} ARRAY_DESC_1;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif //_dma_desc_h
diff --git a/vp8/common/duck_io.h b/vp8/common/duck_io.h
new file mode 100644
index 000000000..f63a5cdc1
--- /dev/null
+++ b/vp8/common/duck_io.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _duck_io_h
+#define _duck_io_h
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
/* 64-bit signed integer type for toolchains that predate <stdint.h>;
   every branch must supply a signed 64-bit int64_t. */
#if defined (_WIN32)
typedef __int64 int64_t;              // MSVC built-in 64-bit type
#elif defined(__MWERKS__)
typedef long long int64_t;            // Metrowerks CodeWarrior
#elif defined(__APPLE__) || defined(__POWERPC)
#include <ppc/types.h>                /* supplies int64_t */
#else
typedef long long int64_t;            // default assumption for other compilers
#endif
+
/* Parameters for re-opening a stream at a given position. */
typedef struct
{
    int64_t offset; // offset to start from
    int blocking;   // non-zero for blocking
} re_open_t;
+
+
/* Status codes for the SAL (stream abstraction layer) I/O routines.
   All values are negative; SAL_ERR_MAX / SAL_ERR_MIN bracket the range. */
typedef enum
{
    SAL_ERR_MAX = -10,
    SAL_ERROR = -11, // Default error
    SAL_ERR_WSASTARTUP = -12,
    SAL_ERR_SOCKET_CREATE = -13,
    SAL_ERR_RESOLVING_HOSTNAME = -14,
    SAL_ERR_SERVER_CONNECTION = -15,
    SAL_ERR_SENDING_DATA = -16,
    SAL_ERR_RECEIVING_DATA = -17,
    SAL_ERR_404_FILE_NOT_FOUND = -18,
    SAL_ERR_PARSING_HTTP_HEADER = -19,
    SAL_ERR_PARSING_CONTENT_LEN = -20,
    SAL_ERR_CONNECTION_TIMEOUT = -21,
    SAL_ERR_FILE_OPEN_FAILED = -22,
    SAL_ERR_MIN = -23
} SAL_ERR; /* EMH 1-15-03 */


/* One code -> message pair for sal_err_text(). */
typedef struct sal_err_map_temp
{
    SAL_ERR code;
    const char *decode;

} sal_err_map_t;


/* Translate a SAL_ERR code into a printable message.
   Returns a pointer to a static string the caller must not modify or
   free, or 0 (NULL) when the code has no table entry (e.g. SAL_ERR_MAX,
   SAL_ERROR, SAL_ERR_MIN).
   NOTE(review): a non-inline static function defined in a header is
   duplicated in every translation unit that includes this file. */
static char *sal_err_text(SAL_ERR e)
{
    /* static const: the table is initialized once and placed in read-only
       storage instead of being rebuilt on every call. */
    static const sal_err_map_t g_sal_err_map[] =
    {
        { SAL_ERR_WSASTARTUP, "Error with WSAStartup" },
        { SAL_ERR_SOCKET_CREATE, "Error creating socket" },
        { SAL_ERR_RESOLVING_HOSTNAME, "Error resolving hostname" },
        { SAL_ERR_SERVER_CONNECTION, "Error connecting to server" },
        { SAL_ERR_SENDING_DATA, "Error sending data" },
        { SAL_ERR_RECEIVING_DATA, "Error receiving data" },
        { SAL_ERR_404_FILE_NOT_FOUND, "Error file not found " },
        { SAL_ERR_PARSING_HTTP_HEADER, "Error parsing http header" },
        { SAL_ERR_PARSING_CONTENT_LEN, "Error parsing content length" },
        { SAL_ERR_CONNECTION_TIMEOUT, "Error Connection timed out" },
        { SAL_ERR_FILE_OPEN_FAILED, "Error opening file" }
    };
    /* unsigned index matches the unsigned result of the sizeof quotient,
       avoiding a signed/unsigned comparison. */
    unsigned int t;

    for (t = 0; t < sizeof(g_sal_err_map) / sizeof(g_sal_err_map[0]); t++)
    {
        if (e == g_sal_err_map[t].code)
            return (char *) g_sal_err_map[t].decode;
    }

    return 0;
}
+
+
+
+
+
+
+
/* Abstract file/stream I/O interface implemented by the embedding
   application; handles are small integers returned by duck_open(). */

int duck_open(const char *fname, unsigned long user_data);

void duck_close(int ghndl);

int duck_read(int ghndl, unsigned char *buf, int nbytes);

int64_t duck_seek(int g_hndl, int64_t offs, int origin);

int duck_read_finished(int han, int flag); /* FWG 7-9-99 */

/* NOTE(review): uses size_t but this header does not include <stddef.h>;
   callers currently get it transitively -- consider adding the include. */
int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */

int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */

int64_t duck_available_data(int handle); /* EMH 10-23-03 */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
new file mode 100644
index 000000000..e524c2acc
--- /dev/null
+++ b/vp8/common/entropy.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+
+#include "entropy.h"
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
/* Short local aliases; #defines rather than typedefs because typedef'd
   names could clash with platform headers. */
#define uchar unsigned char /* typedefs can clash */
#define uint unsigned int

typedef const uchar cuchar;
typedef const uint cuint;

/* Alias for the tree-coder probability type (see treecoder.h). */
typedef vp8_prob Prob;
+
+#include "coefupdateprobs.h"
+
/* Band index for each of the 16 zig-zag coefficient positions: groups
   positions with similar statistics into COEF_BANDS buckets. */
DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
/* Context class contributed by the previously coded token: 0 for
   ZERO/EOB, 1 for ONE, 2 for all larger-magnitude tokens. */
DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
/* Zig-zag scan: scan-order position -> raster position in the 4x4 block. */
DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
{
    0, 1, 4, 8,
    5, 2, 3, 6,
    9, 12, 13, 10,
    7, 11, 14, 15,
};

/* One-hot scan-order masks per raster position; filled in by
   vp8_init_scan_order_mask(). */
DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);

/* Bit widths used to code each macroblock-level feature value. */
const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
+
/* Array indices are identical to previously-existing CONTEXT_NODE indices */

/* Coefficient token tree: two entries per node; negative entries are
   token leaves, positive entries are offsets to the next node pair. */
const vp8_tree_index vp8_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
{
    -DCT_EOB_TOKEN, 2,                        /* 0 = EOB */
    -ZERO_TOKEN, 4,                           /* 1 = ZERO */
    -ONE_TOKEN, 6,                            /* 2 = ONE */
    8, 12,                                    /* 3 = LOW_VAL */
    -TWO_TOKEN, 10,                           /* 4 = TWO */
    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
    14, 16,                                   /* 6 = HIGH_LOW */
    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
    18, 20,                                   /* 8 = CAT_THREEFOUR */
    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
};

/* Token encodings derived from vp8_coef_tree by vp8_coef_tree_initialize(). */
struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
+
/* Trees for extra bits.  Probabilities are constant and
   do not depend on previously encoded bits.
   PcatN has one probability per extra bit of DCT_VAL_CATEGORYN
   (1, 2, 3, 4, 5 and 11 bits respectively -- see vp8_extra_bits). */

static const Prob Pcat1[] = { 159};
static const Prob Pcat2[] = { 165, 145};
static const Prob Pcat3[] = { 173, 148, 140};
static const Prob Pcat4[] = { 176, 155, 140, 135};
static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
static const Prob Pcat6[] =
{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};

/* Degenerate "chain" trees for reading the raw extra bits; built once by
   init_bit_trees(). */
static vp8_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[22];
+
+void vp8_init_scan_order_mask()
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+ }
+
+}
+
+static void init_bit_tree(vp8_tree_index *p, int n)
+{
+ int i = 0;
+
+ while (++i < n)
+ {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+}
+
+static void init_bit_trees()
+{
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 11);
+}
+
+
/* Boolean-coder probability indices for the extra-bit trees (parallel to
   the cat* trees above). */
static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];

/* Per-token extra-bit descriptors, indexed by token value:
   { tree, probabilities, bc indices, number of extra bits, base value }.
   Tokens 0-4 and EOB carry no extra bits; each category's value range is
   [base_val, base_val + 2^Len - 1]. */
vp8_extra_bit_struct vp8_extra_bits[12] =
{
    { 0, 0, 0, 0, 0},
    { 0, 0, 0, 0, 1},
    { 0, 0, 0, 0, 2},
    { 0, 0, 0, 0, 3},
    { 0, 0, 0, 0, 4},
    { cat1, Pcat1, bcc1, 1, 5},
    { cat2, Pcat2, bcc2, 2, 7},
    { cat3, Pcat3, bcc3, 3, 11},
    { cat4, Pcat4, bcc4, 4, 19},
    { cat5, Pcat5, bcc5, 5, 35},
    { cat6, Pcat6, bcc6, 11, 67},
    { 0, 0, 0, 0, 0}
};
#include "defaultcoefcounts.h"
+
+void vp8_default_coef_probs(VP8_COMMON *pc)
+{
+ int h = 0;
+
+ do
+ {
+ int i = 0;
+
+ do
+ {
+ int k = 0;
+
+ do
+ {
+ unsigned int branch_ct [vp8_coef_tokens-1] [2];
+ vp8_tree_probs_from_distribution(
+ vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
+ pc->fc.coef_probs [h][i][k], branch_ct, default_coef_counts [h][i][k],
+ 256, 1);
+
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++i < COEF_BANDS);
+ }
+ while (++h < BLOCK_TYPES);
+}
+
+
/* One-time setup: build the extra-bit chain trees and derive the token
   encodings from the coefficient tree. */
void vp8_coef_tree_initialize()
{
    init_bit_trees();
    vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
}
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
new file mode 100644
index 000000000..1415832d5
--- /dev/null
+++ b/vp8/common/entropy.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPY_H
+#define __INC_ENTROPY_H
+
+#include "treecoder.h"
+#include "blockd.h"
+
/* Coefficient token alphabet.  For the DCT_VAL categories the comment
   gives the coded value range and "extra bits + sign bit"; ranges follow
   the {Len, base_val} entries of vp8_extra_bits in entropy.c. */

#define ZERO_TOKEN              0        //0     Extra Bits 0+0
#define ONE_TOKEN               1        //1     Extra Bits 0+1
#define TWO_TOKEN               2        //2     Extra Bits 0+1
#define THREE_TOKEN             3        //3     Extra Bits 0+1
#define FOUR_TOKEN              4        //4     Extra Bits 0+1
#define DCT_VAL_CATEGORY1       5        //5-6   Extra Bits 1+1
#define DCT_VAL_CATEGORY2       6        //7-10  Extra Bits 2+1
#define DCT_VAL_CATEGORY3       7        //11-18 Extra Bits 3+1
#define DCT_VAL_CATEGORY4       8        //19-34 Extra Bits 4+1
#define DCT_VAL_CATEGORY5       9        //35-66 Extra Bits 5+1
#define DCT_VAL_CATEGORY6       10       //67+   Extra Bits 11+1
#define DCT_EOB_TOKEN           11       //EOB   Extra Bits 0+0

#define vp8_coef_tokens 12
#define MAX_ENTROPY_TOKENS vp8_coef_tokens
#define ENTROPY_NODES 11
+
extern const vp8_tree_index vp8_coef_tree[];

extern struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];

/* Extra-bit description for one token: decoding tree, per-bit
   probabilities, bc probability indices, bit count and base value. */
typedef struct
{
    vp8_tree_p tree;
    const vp8_prob *prob;
    vp8bc_index_t *prob_bc;
    int Len;
    int base_val;
} vp8_extra_bit_struct;

extern vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */

#define PROB_UPDATE_BASELINE_COST 7

#define MAX_PROB 255
#define DCT_MAX_VALUE 2048


/* Coefficients are predicted via a 3-dimensional probability table. */

/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */

#define BLOCK_TYPES 4

/* Middle dimension is a coarsening of the coefficient's
   position within the 4x4 DCT. */

#define COEF_BANDS 8
extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);

/* Inside dimension is 3-valued measure of nearby complexity, that is,
   the extent to which nearby coefficients are nonzero.  For the first
   coefficient (DC, unless block type is 0), we look at the (already encoded)
   blocks above and to the left of the current block.  The context index is
   then the number (0,1,or 2) of these blocks having nonzero coefficients.
   After decoding a coefficient, the measure is roughly the size of the
   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
   Note that the intuitive meaning of this measure changes as coefficients
   are decoded, e.g., prior to the first token, a zero means that my neighbors
   are empty while, after the first token, because of the use of end-of-block,
   a zero means we just decoded a zero and hence guarantees that a non-zero
   coefficient will appear later in this block.  However, this shift
   in meaning is perfectly OK because our context depends also on the
   coefficient band (and since zigzag positions 0, 1, and 2 are in
   distinct bands). */

/*# define DC_TOKEN_CONTEXTS        3 // 00, 0!0, !0!0 */
# define PREV_COEF_CONTEXTS 3

extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]);

extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];


struct VP8Common;
void vp8_default_coef_probs(struct VP8Common *);

extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
/* NOTE(review): the definition in entropy.c uses DECLARE_ALIGNED(16, ...)
   but this declaration omits the alignment attribute -- confirm they are
   compatible on all supported toolchains. */
extern short vp8_default_zig_zag_mask[16];
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];

void vp8_coef_tree_initialize(void);
+#endif
diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c
new file mode 100644
index 000000000..7dc1acde0
--- /dev/null
+++ b/vp8/common/entropymode.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
/* Empirical mode-usage counts used to derive the default mode coding
   probabilities (key-frame and inter-frame Y / UV, and b-modes). */
static const unsigned int kf_y_mode_cts[VP8_YMODES] = { 1607, 915, 812, 811, 5455};
static const unsigned int y_mode_cts  [VP8_YMODES] = { 8080, 1908, 1582, 1007, 5874};

static const unsigned int uv_mode_cts  [VP8_UV_MODES] = { 59483, 13605, 16492, 4230};
static const unsigned int kf_uv_mode_cts[VP8_UV_MODES] = { 5319, 1904, 1703, 674};

/* One count per intra b-mode. */
static const unsigned int bmode_cts[VP8_BINTRAMODES] =
{
    43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
};
+
/* Classification of the (left, above) neighbouring motion vectors, used
   to select a sub-MV reference probability set (see vp8_mv_cont below).
   NOTE(review): the tag "sumvfref_t" looks like a typo for "submvref_t";
   left as-is in case other translation units reference it. */
typedef enum
{
    SUBMVREF_NORMAL,          // neighbours differ; neither special case applies
    SUBMVREF_LEFT_ZED,        // left neighbour is the zero MV
    SUBMVREF_ABOVE_ZED,       // above neighbour is the zero MV
    SUBMVREF_LEFT_ABOVE_SAME, // neighbours are equal (and non-zero)
    SUBMVREF_LEFT_ABOVE_ZED   // neighbours are equal and zero
} sumvfref_t;
+
+int vp8_mv_cont(const MV *l, const MV *a)
+{
+ int lez = (l->row == 0 && l->col == 0);
+ int aez = (a->row == 0 && a->col == 0);
+ int lea = (l->row == a->row && l->col == a->col);
+
+ if (lea && lez)
+ return SUBMVREF_LEFT_ABOVE_ZED;
+
+ if (lea)
+ return SUBMVREF_LEFT_ABOVE_SAME;
+
+ if (aez)
+ return SUBMVREF_ABOVE_ZED;
+
+ if (lez)
+ return SUBMVREF_LEFT_ZED;
+
+ return SUBMVREF_NORMAL;
+}
+
/* Default sub-MV reference probabilities (single shared context). */
static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};

/* Sub-MV reference probabilities conditioned on the neighbour
   classification returned by vp8_mv_cont(), one row per SUBMVREF_*. */
const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
{
    { 147, 136, 18 },
    { 106, 145, 1 },
    { 179, 121, 1 },
    { 223, 1 , 34 },
    { 208, 1 , 1 }
};
+
+
+
/* Macroblock partition maps: each 16-entry table assigns every 4x4 block
   (raster order) to a partition index.  The four split types are:
   two horizontal halves, two vertical halves, four quarters, sixteen
   individual blocks. */
vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
{
    {
        0, 0, 0, 0,
        0, 0, 0, 0,
        1, 1, 1, 1,
        1, 1, 1, 1,
    },
    {
        0, 0, 1, 1,
        0, 0, 1, 1,
        0, 0, 1, 1,
        0, 0, 1, 1,
    },
    {
        0, 0, 1, 1,
        0, 0, 1, 1,
        2, 2, 3, 3,
        2, 2, 3, 3,
    },
    {
        0, 1, 2, 3,
        4, 5, 6, 7,
        8, 9, 10, 11,
        12, 13, 14, 15,
    },
};

/* Number of partitions produced by each split type above. */
const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};

const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
+
+
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */

/* Mode coding trees: two entries per node; negative entries are mode
   leaves, positive entries are offsets to the next node pair. */
const vp8_tree_index vp8_bmode_tree[18] =     /* INTRAMODECONTEXTNODE value */
{
    -B_DC_PRED, 2,                            /* 0 = DC_NODE */
    -B_TM_PRED, 4,                            /* 1 = TM_NODE */
    -B_VE_PRED, 6,                            /* 2 = VE_NODE */
    8, 12,                                    /* 3 = COM_NODE */
    -B_HE_PRED, 10,                           /* 4 = HE_NODE */
    -B_RD_PRED, -B_VR_PRED,                   /* 5 = RD_NODE */
    -B_LD_PRED, 14,                           /* 6 = LD_NODE */
    -B_VL_PRED, 16,                           /* 7 = VL_NODE */
    -B_HD_PRED, -B_HU_PRED                    /* 8 = HD_NODE */
};

/* Again, these trees use the same probability indices as their
   explicitly-programmed predecessors. */

const vp8_tree_index vp8_ymode_tree[8] =
{
    -DC_PRED, 2,
    4, 6,
    -V_PRED, -H_PRED,
    -TM_PRED, -B_PRED
};

/* Key frames order the Y modes differently (B_PRED first). */
const vp8_tree_index vp8_kf_ymode_tree[8] =
{
    -B_PRED, 2,
    4, 6,
    -DC_PRED, -V_PRED,
    -H_PRED, -TM_PRED
};

const vp8_tree_index vp8_uv_mode_tree[6] =
{
    -DC_PRED, 2,
    -V_PRED, 4,
    -H_PRED, -TM_PRED
};

const vp8_tree_index vp8_mbsplit_tree[6] =
{
    -3, 2,
    -2, 4,
    -0, -1
};

const vp8_tree_index vp8_mv_ref_tree[8] =
{
    -ZEROMV, 2,
    -NEARESTMV, 4,
    -NEARMV, 6,
    -NEWMV, -SPLITMV
};

const vp8_tree_index vp8_sub_mv_ref_tree[6] =
{
    -LEFT4X4, 2,
    -ABOVE4X4, 4,
    -ZERO4X4, -NEW4X4
};


/* Token encodings for each tree; filled in by vp8_entropy_mode_init(). */
struct vp8_token_struct vp8_bmode_encodings   [VP8_BINTRAMODES];
struct vp8_token_struct vp8_ymode_encodings   [VP8_YMODES];
struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
struct vp8_token_struct vp8_uv_mode_encodings  [VP8_UV_MODES];
struct vp8_token_struct vp8_mbsplit_encodings  [VP8_NUMMBSPLITS];

struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];


/* Tree for the 3-bit magnitude of a "short" motion-vector component. */
const vp8_tree_index vp8_small_mvtree [14] =
{
    2, 8,
    4, 6,
    -0, -1,
    -2, -3,
    10, 12,
    -4, -5,
    -6, -7
};

struct vp8_token_struct vp8_small_mvencodings [8];
+
/* Compute the default mode coding probabilities (inter/key-frame Y and
   UV) from the built-in mode count tables and install them in the common
   context; also installs the default sub-MV reference probabilities. */
void vp8_init_mbmode_probs(VP8_COMMON *x)
{
    /* Scratch branch counts, sized for the largest alphabet used here. */
    unsigned int bct [VP8_YMODES] [2]; /* num Ymodes > num UV modes */

    vp8_tree_probs_from_distribution(
        VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
        x->fc.ymode_prob, bct, y_mode_cts,
        256, 1
    );
    vp8_tree_probs_from_distribution(
        VP8_YMODES, vp8_kf_ymode_encodings, vp8_kf_ymode_tree,
        x->kf_ymode_prob, bct, kf_y_mode_cts,
        256, 1
    );
    vp8_tree_probs_from_distribution(
        VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
        x->fc.uv_mode_prob, bct, uv_mode_cts,
        256, 1
    );
    vp8_tree_probs_from_distribution(
        VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
        x->kf_uv_mode_prob, bct, kf_uv_mode_cts,
        256, 1
    );
    vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
}
+
+
/* Derive intra b-mode probabilities from an event-count distribution over
   the b-modes; thin wrapper that fixes the tree and encodings. */
static void intra_bmode_probs_from_distribution(
    vp8_prob p [VP8_BINTRAMODES-1],
    unsigned int branch_ct [VP8_BINTRAMODES-1] [2],
    const unsigned int events [VP8_BINTRAMODES]
)
{
    vp8_tree_probs_from_distribution(
        VP8_BINTRAMODES, vp8_bmode_encodings, vp8_bmode_tree,
        p, branch_ct, events,
        256, 1
    );
}
+
+void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
+{
+ unsigned int branch_ct [VP8_BINTRAMODES-1] [2];
+ intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
+}
+
+void vp8_kf_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1])
+{
+ unsigned int branch_ct [VP8_BINTRAMODES-1] [2];
+
+ int i = 0;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ intra_bmode_probs_from_distribution(
+ p[i][j], branch_ct, vp8_kf_default_bmode_counts[i][j]);
+
+ }
+ while (++j < VP8_BINTRAMODES);
+ }
+ while (++i < VP8_BINTRAMODES);
+}
+
+
/* One-time setup: derive all mode token encodings from their trees.
   The MVREF/SUBMVREF targets are the offset macros from entropymode.h
   because those mode enums do not start at zero. */
void vp8_entropy_mode_init()
{
    vp8_tokens_from_tree(vp8_bmode_encodings, vp8_bmode_tree);
    vp8_tokens_from_tree(vp8_ymode_encodings, vp8_ymode_tree);
    vp8_tokens_from_tree(vp8_kf_ymode_encodings, vp8_kf_ymode_tree);
    vp8_tokens_from_tree(vp8_uv_mode_encodings, vp8_uv_mode_tree);
    vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree);

    vp8_tokens_from_tree(VP8_MVREFENCODINGS, vp8_mv_ref_tree);
    vp8_tokens_from_tree(VP8_SUBMVREFENCODINGS, vp8_sub_mv_ref_tree);

    vp8_tokens_from_tree(vp8_small_mvencodings, vp8_small_mvtree);
}
diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h
new file mode 100644
index 000000000..ff630a477
--- /dev/null
+++ b/vp8/common/entropymode.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMODE_H
+#define __INC_ENTROPYMODE_H
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
/* A macroblock split map: for each of the 16 4x4 blocks (raster order),
   the index of the partition it belongs to. */
typedef const int vp8_mbsplit[16];

#define VP8_NUMMBSPLITS 4

extern vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];

extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */

extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];

/* Classify (left, above) neighbour MVs into one of SUBMVREF_COUNT
   contexts for sub-MV reference coding. */
extern int vp8_mv_cont(const MV *l, const MV *a);
#define SUBMVREF_COUNT 5
extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];


/* Key-frame b-mode counts, conditioned on the above and left b-modes. */
extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];


extern const vp8_tree_index vp8_bmode_tree[];

extern const vp8_tree_index vp8_ymode_tree[];
extern const vp8_tree_index vp8_kf_ymode_tree[];
extern const vp8_tree_index vp8_uv_mode_tree[];

extern const vp8_tree_index vp8_mbsplit_tree[];
extern const vp8_tree_index vp8_mv_ref_tree[];
extern const vp8_tree_index vp8_sub_mv_ref_tree[];

extern struct vp8_token_struct vp8_bmode_encodings   [VP8_BINTRAMODES];
extern struct vp8_token_struct vp8_ymode_encodings   [VP8_YMODES];
extern struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
extern struct vp8_token_struct vp8_uv_mode_encodings  [VP8_UV_MODES];
extern struct vp8_token_struct vp8_mbsplit_encodings  [VP8_NUMMBSPLITS];

/* Inter mode values do not start at zero */

extern struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
extern struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];

/* Offset the encoding arrays so they can be indexed directly by mode
   value (NEARESTMV / LEFT4X4 are the first values of their enums). */
#define VP8_MVREFENCODINGS (vp8_mv_ref_encoding_array - NEARESTMV)
#define VP8_SUBMVREFENCODINGS (vp8_sub_mv_ref_encoding_array - LEFT4X4)


extern const vp8_tree_index vp8_small_mvtree[];

extern struct vp8_token_struct vp8_small_mvencodings [8];

void vp8_entropy_mode_init(void);

void vp8_init_mbmode_probs(VP8_COMMON *x);

void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+
+#endif
diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c
new file mode 100644
index 000000000..2b00c17a9
--- /dev/null
+++ b/vp8/common/entropymv.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "entropymv.h"
+
/* Probabilities controlling whether each MV-context probability is
   updated in the frame header, one context per MV component (row, then
   column); layout follows the MVPcount index order in entropymv.h. */
const MV_CONTEXT vp8_mv_update_probs[2] =
{
    {{
        237,
        246,
        253, 253, 254, 254, 254, 254, 254,
        254, 254, 254, 254, 254, 250, 250, 252, 254, 254
    }},
    {{
        231,
        243,
        245, 253, 254, 254, 254, 254, 254,
        254, 254, 254, 254, 254, 251, 251, 254, 254, 254
    }}
};

/* Default motion-vector coding probabilities, one context per component. */
const MV_CONTEXT vp8_default_mv_context[2] =
{
    {{
        // row
        162,                                            // is short
        128,                                            // sign
        225, 146, 172, 147, 214, 39, 156,               // short tree
        128, 129, 132, 75, 145, 178, 206, 239, 254, 254 // long bits
    }},



    {{
        // same for column
        164,                                            // is short
        128,
        204, 170, 119, 235, 140, 230, 228,
        128, 130, 130, 74, 148, 180, 203, 236, 254, 254 // long bits

    }}
};
diff --git a/vp8/common/entropymv.h b/vp8/common/entropymv.h
new file mode 100644
index 000000000..d940c599b
--- /dev/null
+++ b/vp8/common/entropymv.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMV_H
+#define __INC_ENTROPYMV_H
+
+#include "treecoder.h"
+
/* Motion-vector coding constants and probability layout. */
enum
{
    mv_max = 1023,               /* max absolute value of a MV component */
    MVvals = (2 * mv_max) + 1,   /* # possible values "" */

    mvlong_width = 10,           /* Large MVs have 9 bit magnitudes */
    mvnum_short = 8,             /* magnitudes 0 through 7 */

    /* probability offsets for coding each MV component */

    mvpis_short = 0,             /* short (<= 7) vs long (>= 8) */
    MVPsign,                     /* sign for non-zero */
    MVPshort,                    /* 8 short values = 7-position tree */

    MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
    MVPcount = MVPbits + mvlong_width     /* (with independent probabilities) */
};

/* All probabilities needed to code one MV component. */
typedef struct mv_context
{
    vp8_prob prob[MVPcount];  /* often come in row, col pairs */
} MV_CONTEXT;

extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#endif
diff --git a/vp8/common/extend.c b/vp8/common/extend.c
new file mode 100644
index 000000000..74079527c
--- /dev/null
+++ b/vp8/common/extend.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
/* Replicate a plane's edge pixels into its surrounding border.
   s points at the top-left pixel of the valid w x h area and sp is the
   stride; et/el/eb/er give the border thickness to fill on each side.
   The left/right borders replicate the edge columns first, then whole
   edge rows (including the just-filled corners) are copied into the
   top/bottom borders. */
static void extend_plane_borders
(
    unsigned char *s, // source
    int sp,           // pitch
    int h,            // height
    int w,            // width
    int et,           // extend top border
    int el,           // extend left border
    int eb,           // extend bottom border
    int er            // extend right border
)
{

    int i;
    unsigned char *src_ptr1, *src_ptr2;
    unsigned char *dest_ptr1, *dest_ptr2;
    int linesize;

    // copy the left and right most columns out
    src_ptr1 = s;            // first valid column
    src_ptr2 = s + w - 1;    // last valid column
    dest_ptr1 = s - el;      // start of left border
    dest_ptr2 = s + w;       // start of right border

    /* NOTE(review): iterates h + 1 times, so the side borders of one row
       *below* the valid area are also filled -- presumably paired with
       the "+ 1" in linesize below; confirm the frame buffer's border
       allocation covers this. */
    for (i = 0; i < h - 0 + 1; i++)
    {
        vpx_memset(dest_ptr1, src_ptr1[0], el);
        vpx_memset(dest_ptr2, src_ptr2[0], er);
        src_ptr1 += sp;
        src_ptr2 += sp;
        dest_ptr1 += sp;
        dest_ptr2 += sp;
    }

    // Now copy the top and bottom source lines into each line of the respective borders
    src_ptr1 = s - el;                 // top valid row, including left border
    src_ptr2 = s + sp * (h - 1) - el;  // bottom valid row, including left border
    dest_ptr1 = s + sp * (-et) - el;   // first row of the top border
    dest_ptr2 = s + sp * (h) - el;     // first row of the bottom border
    /* NOTE(review): el + w + er would be the exact bordered width; the
       extra "+ 1" copies one byte past the right border -- verify the
       allocation leaves slack. */
    linesize = el + er + w + 1;

    for (i = 0; i < (int)et; i++)
    {
        vpx_memcpy(dest_ptr1, src_ptr1, linesize);
        dest_ptr1 += sp;
    }

    for (i = 0; i < (int)eb; i++)
    {
        vpx_memcpy(dest_ptr2, src_ptr2, linesize);
        dest_ptr2 += sp;
    }
}
+
+
+void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
+{
+ int er = 0xf & (16 - (width & 0xf));
+ int eb = 0xf & (16 - (height & 0xf));
+
+ // check for non multiples of 16
+ if (er != 0 || eb != 0)
+ {
+ extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
+
+ //adjust for uv
+ height = (height + 1) >> 1;
+ width = (width + 1) >> 1;
+ er = 0x7 & (8 - (width & 0x7));
+ eb = 0x7 & (8 - (height & 0x7));
+
+ if (er || eb)
+ {
+ extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
+ extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
+ }
+ }
+}
+
+// note the extension is only for the last row, for intra prediction purpose
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
+{
+ int i;
+
+ YPtr += ybf->y_stride * 14;
+ UPtr += ybf->uv_stride * 6;
+ VPtr += ybf->uv_stride * 6;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+
+ YPtr += ybf->y_stride;
+ UPtr += ybf->uv_stride;
+ VPtr += ybf->uv_stride;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+}
diff --git a/vp8/common/extend.h b/vp8/common/extend.h
new file mode 100644
index 000000000..6809ae756
--- /dev/null
+++ b/vp8/common/extend.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_EXTEND_H
+#define __INC_EXTEND_H
+
#include "vpx_scale/yv12config.h"

/* NOTE(review): no definition of Extend() appears in extend.c in this
   change -- the prototype may be stale. */
void Extend(YV12_BUFFER_CONFIG *ybf);
/* Replicate border pixels to the right of a macroblock's bottom rows
   (intra prediction on the last row). */
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
/* Pad plane right/bottom edges so dimensions become multiples of 16
   (luma) / 8 (chroma). */
void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height);
+
+#endif
diff --git a/vp8/common/filter_c.c b/vp8/common/filter_c.c
new file mode 100644
index 000000000..38991cb28
--- /dev/null
+++ b/vp8/common/filter_c.c
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+
+static const int bilinear_filters[8][2] =
+{
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+
+static const short sub_pel_filters[8][6] =
+{
+
+ { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
+ { 0, -1, 12, 123, -6, 0 },
+
+
+
+};
+
+// First (horizontal) pass of the separable 6-tap sub-pel filter.
+// Reads 6 source samples around each output position (offsets -2..+3 in
+// units of pixel_step), applies vp8_filter[0..5], rounds, shifts, clamps to
+// 0-255 and stores the result as int so the second pass keeps full range.
+// src_ptr must therefore be valid 2*pixel_step before and 3*pixel_step after
+// each filtered position.
+void vp8_filter_block2d_first_pass
+(
+ unsigned char *src_ptr,
+ int *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = Temp;
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+// Second (vertical) pass of the separable 6-tap sub-pel filter.
+// Consumes the int intermediate produced by the first pass, applies the
+// 6 vertical taps, rounds/shifts/clamps and writes 8-bit pixels with the
+// destination pitch (output_pitch) independent of the intermediate stride.
+void vp8_filter_block2d_second_pass
+(
+ int *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ // Apply filter: 6 taps at offsets -2..+3 rows (pixel_step = row stride
+ // of the intermediate buffer).
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = (unsigned char)Temp;
+ src_ptr++;
+ }
+
+ // Start next row
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
+
+// 2-D 6-tap filter for a 4x4 block: horizontal pass over 9 rows (4 output
+// rows + 5 rows of vertical filter support), then vertical pass on the
+// intermediate. FData+8 skips the two leading support rows (2 rows * 4 cols).
+void vp8_filter_block2d
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int output_pitch,
+ const short *HFilter,
+ const short *VFilter
+)
+{
+ int FData[9*4]; // Temp data buffer used in filtering
+
+ // First filter 1-D horizontally...
+ vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+ // then filter vertically...
+ vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+// Accumulates horizontal and vertical first-difference activity (sum of
+// absolute neighbor differences) over a 4x4 block into *HVar / *VVar.
+// NOTE(review): the accumulators are += only — callers are presumably
+// expected to zero them first; confirm at call sites.
+// The j+1 / j+src_pixels_per_line reads touch one column right of and one
+// row below the 4x4 block, so that margin must be valid.
+void vp8_block_variation_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int *HVar,
+ int *VVar
+)
+{
+ int i, j;
+ unsigned char *Ptr = src_ptr;
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);
+ *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);
+ }
+
+ Ptr += src_pixels_per_line;
+ }
+}
+
+
+
+
+// 6-tap sub-pel prediction, 4x4. xoffset/yoffset (0..7) select the 1/8-pel
+// filter taps; filtering itself is delegated to vp8_filter_block2d.
+void vp8_sixtap_predict_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+// 6-tap sub-pel prediction, 8x8. Horizontal pass covers 13 rows
+// (8 output + 5 support); FData+16 skips the 2 leading support rows (2*8).
+void vp8_sixtap_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; // Temp data buffer used in filtering
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ // First filter 1-D horizontally...
+ vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+ // then filter vertically...
+ vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+// 6-tap sub-pel prediction, 8x4. 9 intermediate rows (4 output + 5 support).
+void vp8_sixtap_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; // Temp data buffer used in filtering
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ // First filter 1-D horizontally...
+ vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+ // then filter vertically...
+ vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+// 6-tap sub-pel prediction, 16x16. 21 intermediate rows (16 output + 5
+// support); FData+32 skips the 2 leading support rows (2*16).
+void vp8_sixtap_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[21*24]; // Temp data buffer used in filtering
+
+
+ HFilter = sub_pel_filters[xoffset]; // 6 tap
+ VFilter = sub_pel_filters[yoffset]; // 6 tap
+
+ // First filter 1-D horizontally...
+ vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+ // then filter vertically...
+ vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces UINT16 (unsigned short) output to retain
+ * precision for the next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+void vp8_filter_block2d_bil_first_pass
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ // Apply bilinear filter; no clamp needed since taps are non-negative
+ // and sum to VP8_FILTER_WEIGHT, so the result stays in range.
+ output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : UINT16 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT8 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires UINT16 (unsigned short) input as produced by
+ * filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+void vp8_filter_block2d_bil_second_pass
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ // Apply filter
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+ output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * INT32 *HFilter : Array of 2 horizontal filter taps.
+ * INT32 *VFilter : Array of 2 vertical filter taps.
+ *
+ * OUTPUTS : UINT8 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The largest block size that can be handled here is 16x16;
+ * the intermediate buffer holds (16+1) x 16 samples, one
+ * extra row being required as vertical filter support.
+ *
+ ****************************************************************************/
+void vp8_filter_block2d_bil
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int dst_pitch,
+ const int *HFilter,
+ const int *VFilter,
+ int Width,
+ int Height
+)
+{
+
+ unsigned short FData[17*16]; // Temp data buffer used in filtering
+
+ // First filter 1-D horizontally (Height+1 rows: one extra for the
+ // vertical pass's second tap)...
+ vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
+
+ // then 1-D vertically...
+ vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
+}
+
+
+// Bilinear sub-pel prediction, 4x4. xoffset/yoffset (0..7) index the
+// 2-tap 1/8-pel filters in bilinear_filters.
+void vp8_bilinear_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const int *HFilter;
+ const int *VFilter;
+
+ HFilter = bilinear_filters[xoffset];
+ VFilter = bilinear_filters[yoffset];
+#if 0
+ // Disabled cross-check of the MMX implementation against the C path.
+ {
+ int i;
+ unsigned char temp1[16];
+ unsigned char temp2[16];
+
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+ for (i = 0; i < 16; i++)
+ {
+ if (temp1[i] != temp2[i])
+ {
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+ }
+ }
+ }
+#endif
+ vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+// Bilinear sub-pel prediction, 8x8.
+void vp8_bilinear_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const int *HFilter;
+ const int *VFilter;
+
+ HFilter = bilinear_filters[xoffset];
+ VFilter = bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+// Bilinear sub-pel prediction, 8x4.
+void vp8_bilinear_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const int *HFilter;
+ const int *VFilter;
+
+ HFilter = bilinear_filters[xoffset];
+ VFilter = bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+// Bilinear sub-pel prediction, 16x16.
+void vp8_bilinear_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const int *HFilter;
+ const int *VFilter;
+
+ HFilter = bilinear_filters[xoffset];
+ VFilter = bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
new file mode 100644
index 000000000..fcb1f202c
--- /dev/null
+++ b/vp8/common/findnearmv.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+
+#define FINDNEAR_SEARCH_SITES 3
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+ Note that we only consider one 4x4 subblock from each candidate 16x16
+ macroblock. */
+
+typedef union
+{
+ unsigned int as_int;
+ MV as_mv;
+} int_mv; /* facilitates rapid equality tests */
+
+// Copies x's MV into mvp, negating both components when the candidate's
+// reference frame and the target refframe have opposite sign biases, so all
+// candidate MVs are expressed in a consistent temporal direction.
+static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+ MV xmv;
+ xmv = x->mbmi.mv.as_mv;
+
+ if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
+ {
+ xmv.row *= -1;
+ xmv.col *= -1;
+ }
+
+ mvp->as_mv = xmv;
+}
+
+
+// Clamps mv so the prediction block stays within the frame border plus the
+// 16-pel (in 1/8-pel units, see LEFT_TOP_MARGIN/RIGHT_BOTTOM_MARGIN) margin,
+// using the per-MB edge distances stored in xd.
+void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
+{
+ if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+ mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+ mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+ if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+ mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+ mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+
+// Surveys the above / left / above-left neighbor MBs and produces the
+// NEAREST and NEAR motion vector candidates, the "best" starting MV, and
+// the cnt[] weights used to derive MV-mode coding probabilities.
+// The mv/cntx pointers walk in lockstep through near_mvs[]/cnt[]: each
+// distinct non-zero neighbor MV advances both, and weight is added to the
+// slot of whichever MV (or the zero/intra slot 0) the neighbor matched.
+// Above and left neighbors count double (weight 2); above-left counts 1.
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ MV *nearest,
+ MV *nearby,
+ MV *best_mv,
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[4];
+ int_mv *mv = near_mvs;
+ int *cntx = cnt;
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+ /* Zero accumulators */
+ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++mv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(above, refframe, mv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ /* Weight lands on the new slot for a non-zero MV, else on CNT_INTRA
+ (zero-MV) slot. */
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(left, refframe, &this_mv, ref_frame_sign_bias);
+
+ /* Only open a new slot when this MV differs from the previous one */
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+ /* If we have three distinct MV's ... */
+ if (cnt[CNT_SPLITMV])
+ {
+ /* See if above-left MV can be merged with NEAREST */
+ if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+ cnt[CNT_NEAREST] += 1;
+ }
+
+ /* Repurpose cnt[CNT_SPLITMV] as the SPLITMV-neighbor weight used for
+ the mode probability lookup. */
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ /* Use near_mvs[0] to store the "best" MV */
+ if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+ near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+ /* Set up return values */
+ *best_mv = near_mvs[0].as_mv;
+ *nearest = near_mvs[CNT_NEAREST].as_mv;
+ *nearby = near_mvs[CNT_NEAR].as_mv;
+
+ vp8_clamp_mv(nearest, xd);
+ vp8_clamp_mv(nearby, xd);
+ vp8_clamp_mv(best_mv, xd); //TODO: move this up before the copy
+}
+
+// Fills p[] with the MV reference mode probabilities selected by the
+// neighbor counts produced by vp8_find_near_mvs, and returns p.
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+)
+{
+ p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
+ p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
+ p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
+ p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
+ //p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];
+ return p;
+}
+
+// Returns the sub-block mode info of the 4x4 block to the left of block b
+// (0..15 in raster order); wraps into the neighboring MB on the left edge.
+const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+ b += 4;
+ }
+
+ return cur_mb->bmi + b - 1;
+}
+
+// Returns the sub-block mode info of the 4x4 block above block b; wraps
+// into the MB above (mi_stride entries back) on the top edge.
+const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+ b += 16;
+ }
+
+ return cur_mb->bmi + b - 4;
+}
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
new file mode 100644
index 000000000..2c02033e6
--- /dev/null
+++ b/vp8/common/findnearmv.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_FINDNEARMV_H
+#define __INC_FINDNEARMV_H
+
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ MV *nearest, MV *nearby, MV *best,
+ int near_mv_ref_cts[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+);
+
+const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);
+
+const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+
+
+#endif
diff --git a/vp8/common/fourcc.hpp b/vp8/common/fourcc.hpp
new file mode 100644
index 000000000..5f1faed2f
--- /dev/null
+++ b/vp8/common/fourcc.hpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef FOURCC_HPP
+#define FOURCC_HPP
+
+#include <iosfwd>
+#include <cstring>
+
+
+#if defined(__POWERPC__) || defined(__APPLE__) || defined(__MERKS__)
+using namespace std;
+#endif
+
+// Small value type wrapping a FOURCC code, viewable either as 4 raw bytes
+// or as an unsigned long.
+// NOTE(review): the char[4]/unsigned long union relies on reading the
+// inactive union member; byte order of the long form is therefore
++// host-endian dependent — confirm callers only compare, never serialize.
+class four_cc
+{
+public:
+
+ four_cc();
+ four_cc(const char*);
+ explicit four_cc(unsigned long);
+
+ bool operator==(const four_cc&) const;
+ bool operator!=(const four_cc&) const;
+
+ bool operator==(const char*) const;
+ bool operator!=(const char*) const;
+
+ operator unsigned long() const;
+ unsigned long as_long() const;
+
+ four_cc& operator=(unsigned long);
+
+ char operator[](int) const;
+
+ std::ostream& put(std::ostream&) const;
+
+ bool printable() const;
+
+private:
+
+ union
+ {
+ char code[4]; // the four characters, in stream order
+ unsigned long code_as_long; // same bytes viewed as an integer
+ };
+
+};
+
+
+// Default constructor leaves the code deliberately uninitialized.
+inline four_cc::four_cc()
+{
+}
+
+inline four_cc::four_cc(unsigned long x)
+ : code_as_long(x)
+{
+}
+
+// str must point at (at least) 4 readable characters.
+inline four_cc::four_cc(const char* str)
+{
+ memcpy(code, str, 4);
+}
+
+
+// Equality compares the packed integer form; byte-wise identical codes.
+inline bool four_cc::operator==(const four_cc& rhs) const
+{
+ return code_as_long == rhs.code_as_long;
+}
+
+inline bool four_cc::operator!=(const four_cc& rhs) const
+{
+ return !operator==(rhs);
+}
+
+inline bool four_cc::operator==(const char* rhs) const
+{
+ return (memcmp(code, rhs, 4) == 0);
+}
+
+inline bool four_cc::operator!=(const char* rhs) const
+{
+ return !operator==(rhs);
+}
+
+
+inline four_cc::operator unsigned long() const
+{
+ return code_as_long;
+}
+
+inline unsigned long four_cc::as_long() const
+{
+ return code_as_long;
+}
+
+// i is unchecked; valid range is 0..3.
+inline char four_cc::operator[](int i) const
+{
+ return code[i];
+}
+
+inline four_cc& four_cc::operator=(unsigned long val)
+{
+ code_as_long = val;
+ return *this;
+}
+
+// Stream insertion delegates to put(), defined elsewhere.
+inline std::ostream& operator<<(std::ostream& os, const four_cc& rhs)
+{
+ return rhs.put(os);
+}
+
+#endif
diff --git a/vp8/common/g_common.h b/vp8/common/g_common.h
new file mode 100644
index 000000000..e68c53e1c
--- /dev/null
+++ b/vp8/common/g_common.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+extern void (*vp8_clear_system_state)(void);
+extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q);
+extern void (*de_interlace)
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int Width,
+ int Height,
+ int Stride
+);
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
new file mode 100644
index 000000000..0011ae0dc
--- /dev/null
+++ b/vp8/common/generic/systemdependent.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "g_common.h"
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
+
+void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
+
+void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
+
+// Populates the runtime CPU-dispatch table (rtcd) with the generic C
+// implementations, then lets the x86 init (when built for x86) override
+// entries with optimized versions. Also sets the two intra-predictor
+// function pointers, which are plain globals rather than rtcd entries.
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
+
+ rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
+ rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
+ rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
+ rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
+ rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c;
+
+ rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
+ rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
+ rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
+ rtcd->recon.recon = vp8_recon_b_c;
+ rtcd->recon.recon2 = vp8_recon2b_c;
+ rtcd->recon.recon4 = vp8_recon4b_c;
+
+ rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
+ rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
+ rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c;
+ rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c;
+ rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
+ rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c;
+ rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c;
+ rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c;
+
+ rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
+ rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c;
+ rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
+ rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c;
+ rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
+ rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c;
+ rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
+ rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c;
+
+#if CONFIG_POSTPROC || CONFIG_VP8_ENCODER
+ rtcd->postproc.down = vp8_mbpost_proc_down_c;
+ rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
+ rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
+ rtcd->postproc.addnoise = vp8_plane_add_noise_c;
+#endif
+
+#endif
+ // Pure C:
+ vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
+ vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
+
+#if ARCH_X86 || ARCH_X86_64
+ vp8_arch_x86_common_init(ctx);
+#endif
+
+}
diff --git a/vp8/common/header.h b/vp8/common/header.h
new file mode 100644
index 000000000..8b2b0094a
--- /dev/null
+++ b/vp8/common/header.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct
+{
+ unsigned int type: 1;
+ unsigned int version: 3;
+ unsigned int show_frame: 1;
+
+ /* Allow 2^20 bytes = 8 megabits for first partition */
+
+ unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+ unsigned int frame_number;
+ unsigned int update_gold: 1;
+ unsigned int uses_gold: 1;
+ unsigned int update_last: 1;
+ unsigned int uses_last: 1;
+#endif
+
+} VP8_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3
+#endif
+
+
+#endif
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
new file mode 100644
index 000000000..47b5f0576
--- /dev/null
+++ b/vp8/common/idct.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_IDCT_H
+#define __INC_IDCT_H
+
+#define prototype_second_order(sym) \
+ void sym(short *input, short *output)
+
+#define prototype_idct(sym) \
+ void sym(short *input, short *output, int pitch)
+
+#define prototype_idct_scalar(sym) \
+ void sym(short input, short *output, int pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/idct_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/idct_arm.h"
+#endif
+
+#ifndef vp8_idct_idct1
+#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
+#endif
+extern prototype_idct(vp8_idct_idct1);
+
+#ifndef vp8_idct_idct16
+#define vp8_idct_idct16 vp8_short_idct4x4llm_c
+#endif
+extern prototype_idct(vp8_idct_idct16);
+
+#ifndef vp8_idct_idct1_scalar
+#define vp8_idct_idct1_scalar vp8_dc_only_idct_c
+#endif
+extern prototype_idct_scalar(vp8_idct_idct1_scalar);
+
+
+#ifndef vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_c
+#endif
+extern prototype_second_order(vp8_idct_iwalsh1);
+
+#ifndef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_c
+#endif
+extern prototype_second_order(vp8_idct_iwalsh16);
+
+typedef prototype_idct((*vp8_idct_fn_t));
+typedef prototype_idct_scalar((*vp8_idct_scalar_fn_t));
+typedef prototype_second_order((*vp8_second_order_fn_t));
+
+typedef struct
+{
+ vp8_idct_fn_t idct1;
+ vp8_idct_fn_t idct16;
+ vp8_idct_scalar_fn_t idct1_scalar;
+
+ vp8_second_order_fn_t iwalsh1;
+ vp8_second_order_fn_t iwalsh16;
+} vp8_idct_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IDCT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define IDCT_INVOKE(ctx,fn) vp8_idct_##fn
+#endif
+
+#endif
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
new file mode 100644
index 000000000..57cf8584e
--- /dev/null
+++ b/vp8/common/idctllm.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point versions of two
+ * multiply constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+static const int rounding = 0;
+// 4x4 inverse DCT (LLM form): column pass into `output`, then row pass in
+// place with the final (x+4)>>3 descale. pitch is in bytes; shortpitch
+// converts it to a stride in shorts. Input is laid out with a fixed
+// 8-short row stride (indices 0/4/8/12 per column).
+void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = pitch >> 1;
+
+ // Column (vertical) butterflies.
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ // x*sinpi8sqrt2>>16 and x + (x*cospi8sqrt2minus1>>16) implement the
+ // fixed-point rotations described in the file header.
+ temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ // Row (horizontal) pass operates on the column-pass result in place.
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+
+ // Final descale with rounding: (x + 4) >> 3.
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+}
+
+// DC-only inverse DCT: when only input[0] is non-zero the whole 4x4 output
+// is the descaled DC value, so fill all 16 entries with (dc + 4) >> 3.
+void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1;
+ short *op = output;
+ int shortpitch = pitch >> 1;
+ a1 = ((input[0] + 4) >> 3);
+
+ for (i = 0; i < 4; i++)
+ {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += shortpitch;
+ }
+}
+
+
+// Same DC-only fill, but takes the DC coefficient by value rather than
+// through a pointer (used for the dequantized-scalar path).
+void vp8_dc_only_idct_c(short input_dc, short *output, int pitch)
+{
+ int i;
+ int a1;
+ short *op = output;
+ int shortpitch = pitch >> 1;
+ a1 = ((input_dc + 4) >> 3);
+
+ for (i = 0; i < 4; i++)
+ {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += shortpitch;
+ }
+}
+
+// 4x4 inverse Walsh-Hadamard transform for the second-order (Y2/DC) block:
+// column butterflies into output (fixed stride of 4 shorts), then row
+// butterflies in place with the final (x + 3) >> 3 descale.
+void vp8_short_inv_walsh4x4_c(short *input, short *output)
+{
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ // Column pass.
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+ ip++;
+ op++;
+ }
+
+ // Row pass, in place on the column-pass result.
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = (a2 + 3) >> 3;
+ op[1] = (b2 + 3) >> 3;
+ op[2] = (c2 + 3) >> 3;
+ op[3] = (d2 + 3) >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+// DC-only inverse WHT: all 16 outputs equal the descaled DC coefficient.
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
+{
+ int i;
+ int a1;
+ short *op = output;
+
+ a1 = ((input[0] + 3) >> 3);
+
+ for (i = 0; i < 4; i++)
+ {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += 4;
+ }
+}
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
new file mode 100644
index 000000000..1ff596ead
--- /dev/null
+++ b/vp8/common/invtrans.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "invtrans.h"
+
+
+
+// Scatter the 16 inverse-WHT second-order DC values (held in the Y2 block,
+// block[24].diff) back into dqcoeff[0] of each of the 16 Y subblocks, so the
+// subsequent per-block IDCT sees the reconstructed DC coefficient.
+static void recon_dcblock(MACROBLOCKD *x)
+{
+ BLOCKD *b = &x->block[24];
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ x->block[i].dqcoeff[0] = b->diff[i];
+ }
+
+}
+
+// Inverse transform one 4x4 block: full IDCT when coefficients beyond the DC
+// are present (eob > 1), otherwise the cheap DC-only path.
+void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
+{
+ if (b->eob > 1)
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+ else
+ IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+}
+
+
+// Inverse transform the 16 Y blocks of a macroblock, after first running the
+// second-order inverse WHT and scattering its DC values back into the blocks.
+// NOTE(review): the WHT is applied unconditionally here, unlike
+// vp8_inverse_transform_mb() which skips it for B_PRED/SPLITMV — presumably
+// callers only use this path when the 2nd order transform applies; verify.
+void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ // do 2nd order transform on the dc block
+ IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
+
+ recon_dcblock(x);
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 32); // 32-byte pitch = 16 shorts per Y diff row
+ }
+
+}
+// Inverse transform the 8 chroma blocks (indices 16..23: 4 U then 4 V).
+void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 16); // 16-byte pitch = 8 shorts per UV diff row
+ }
+
+}
+
+
+// Inverse transform a whole macroblock (16 Y + 8 UV blocks). The 2nd order
+// WHT on the Y2 block is skipped for B_PRED and SPLITMV, which do not use a
+// second-order transform.
+void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ {
+ // do 2nd order transform on the dc block
+
+ IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
+ recon_dcblock(x);
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 32);
+ }
+
+
+ for (i = 16; i < 24; i++)
+ {
+ vp8_inverse_transform_b(rtcd, &x->block[i], 16);
+ }
+
+}
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h
new file mode 100644
index 000000000..93a40f956
--- /dev/null
+++ b/vp8/common/invtrans.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "blockd.h"
+extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
+extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+
+#endif
diff --git a/vp8/common/littlend.h b/vp8/common/littlend.h
new file mode 100644
index 000000000..08c525c5d
--- /dev/null
+++ b/vp8/common/littlend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _littlend_h
+#define _littlend_h
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define invert2(x) (x)
+#define invert4(x) (x)
+
+#define low_byte(x) (unsigned char)x
+#define mid1Byte(x) (unsigned char)(x >> 8)
+#define mid2Byte(x) (unsigned char)(x >> 16)
+#define high_byte(x) (unsigned char)(x >> 24)
+
+#define SWAPENDS 0
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
new file mode 100644
index 000000000..79e617754
--- /dev/null
+++ b/vp8/common/loopfilter.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
+prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
+prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
+
+// Horizontal MB filtering
+// Normal-filter macroblock horizontal edge: Y spans 16 pixels (count 2 => 2*8),
+// U/V span 8 (count 1). u_ptr/v_ptr may be NULL for luma-only filtering.
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+}
+
+// Simple-filter macroblock horizontal edge: luma only (chroma untouched).
+void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+// Vertical MB Filtering
+// Normal-filter macroblock vertical (left) edge for Y and, if present, U/V.
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+}
+
+// Simple-filter macroblock vertical edge: luma only.
+void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+// Horizontal B Filtering
+// Normal-filter the three interior horizontal subblock edges of a macroblock
+// (rows 4, 8, 12 for Y; row 4 for each chroma plane).
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+}
+
+// Simple-filter the three interior horizontal subblock edges, luma only.
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+
+// Vertical B Filtering
+// Normal-filter the three interior vertical subblock edges of a macroblock
+// (columns 4, 8, 12 for Y; column 4 for each chroma plane).
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+ vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+}
+
+// Simple-filter the three interior vertical subblock edges, luma only.
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) u_ptr;
+ (void) v_ptr;
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+
+// Build the per-level loop_filter_info table (one entry for every possible
+// filter level 0..MAX_LOOP_FILTER) from the frame's sharpness level and
+// frame type, then select the normal or simple filter function pointers.
+void vp8_init_loop_filter(VP8_COMMON *cm)
+{
+ loop_filter_info *lfi = cm->lf_info;
+ LOOPFILTERTYPE lft = cm->filter_type;
+ int sharpness_lvl = cm->sharpness_level;
+ int frame_type = cm->frame_type;
+ int i, j;
+
+ int block_inside_limit = 0;
+ int HEVThresh;
+ const int yhedge_boost = 2;
+ const int uvhedge_boost = 2;
+
+ // For each possible value for the loop filter fill out a "loop_filter_info" entry.
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+
+ // High-edge-variance threshold steps with the filter level; key frames
+ // use a slightly lower top threshold than inter frames.
+ if (frame_type == KEY_FRAME)
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+ else
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 3;
+ else if (filt_lvl >= 20)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+
+ // Set loop filter parameters that control sharpness.
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0)
+ {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1)
+ block_inside_limit = 1;
+
+ // Each threshold is replicated 16x so the SIMD filters can load it
+ // directly into a vector register (see loop_filter_info declaration).
+ for (j = 0; j < 16; j++)
+ {
+ lfi[i].lim[j] = block_inside_limit;
+ lfi[i].mbflim[j] = filt_lvl + yhedge_boost;
+ lfi[i].mbthr[j] = HEVThresh;
+ lfi[i].flim[j] = filt_lvl;
+ lfi[i].thr[j] = HEVThresh;
+ lfi[i].uvlim[j] = block_inside_limit;
+ lfi[i].uvmbflim[j] = filt_lvl + uvhedge_boost;
+ lfi[i].uvmbthr[j] = HEVThresh;
+ lfi[i].uvflim[j] = filt_lvl;
+ lfi[i].uvthr[j] = HEVThresh;
+ }
+
+ }
+
+ // Set up the function pointers depending on the type of loop filtering selected
+ if (lft == NORMAL_LOOPFILTER)
+ {
+ cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
+ cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v);
+ cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h);
+ cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h);
+ }
+ else
+ {
+ cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v);
+ cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v);
+ cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h);
+ cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h);
+ }
+}
+
+// Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
+// each frame. Check last_frame_type to skip the function most of times.
+// Cheap per-frame refresh of the filter table: only the frame-type-dependent
+// HEV thresholds change between frames, so the sharpness-derived limits
+// (computed by vp8_init_loop_filter) are left untouched.
+void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
+{
+ int HEVThresh;
+ int i, j;
+
+ // For each possible value for the loop filter fill out a "loop_filter_info" entry.
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+
+ if (frame_type == KEY_FRAME)
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+ else
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 3;
+ else if (filt_lvl >= 20)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+
+ // Only the threshold fields are refreshed; the commented-out fields
+ // keep the values set by vp8_init_loop_filter().
+ for (j = 0; j < 16; j++)
+ {
+ //lfi[i].lim[j] = block_inside_limit;
+ //lfi[i].mbflim[j] = filt_lvl+yhedge_boost;
+ lfi[i].mbthr[j] = HEVThresh;
+ //lfi[i].flim[j] = filt_lvl;
+ lfi[i].thr[j] = HEVThresh;
+ //lfi[i].uvlim[j] = block_inside_limit;
+ //lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;
+ lfi[i].uvmbthr[j] = HEVThresh;
+ //lfi[i].uvflim[j] = filt_lvl;
+ lfi[i].uvthr[j] = HEVThresh;
+ }
+ }
+}
+
+
+// Adjust the macroblock's loop filter level in place by the reference-frame
+// and prediction-mode deltas, then clamp to [0, MAX_LOOP_FILTER]. A no-op
+// when mode/ref deltas are disabled.
+void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level)
+{
+ MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
+
+ if (mbd->mode_ref_lf_delta_enabled)
+ {
+ // Apply delta for reference frame
+ *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
+
+ // Apply delta for mode
+ if (mbmi->ref_frame == INTRA_FRAME)
+ {
+ // Only the split mode BPRED has a further special case
+ if (mbmi->mode == B_PRED)
+ *filter_level += mbd->mode_lf_deltas[0];
+ }
+ else
+ {
+ // Zero motion mode
+ if (mbmi->mode == ZEROMV)
+ *filter_level += mbd->mode_lf_deltas[1];
+
+ // Split MB motion mode
+ else if (mbmi->mode == SPLITMV)
+ *filter_level += mbd->mode_lf_deltas[3];
+
+ // All other inter motion modes (Nearest, Near, New)
+ else
+ *filter_level += mbd->mode_lf_deltas[2];
+ }
+
+ // Range check
+ if (*filter_level > MAX_LOOP_FILTER)
+ *filter_level = MAX_LOOP_FILTER;
+ else if (*filter_level < 0)
+ *filter_level = 0;
+ }
+}
+
+
+// Apply the in-loop deblocking filter to the whole frame (Y, U and V planes).
+// Per-macroblock strength = per-segment baseline level, adjusted by the
+// mode/ref-frame deltas via vp8_adjust_mb_lf_value(). MB left/top edges are
+// skipped at the frame border; interior subblock edges are skipped when the
+// MB has no residual detail (dc_diff == 0).
+void vp8_loop_filter_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info *lfi = cm->lf_info;
+ int frame_type = cm->frame_type;
+
+ int mb_row;
+ int mb_col;
+
+
+ int baseline_filter_level[MAX_MB_SEGMENTS];
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
+
+ int i;
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
+
+ // Note the baseline filter values for each segment
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ {
+ // Abs value
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ // Delta Value
+ else
+ {
+ baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ baseline_filter_level[i] = default_filt_lvl;
+ }
+
+ // Initialize the loop filter for this frame.
+ // Full rebuild only when filter type or sharpness changed; otherwise just
+ // refresh the frame-type-dependent thresholds.
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter(lfi, frame_type);
+
+ // Set up the buffer pointers
+ y_ptr = post->y_buffer;
+ u_ptr = post->u_buffer;
+ v_ptr = post->v_buffer;
+
+ // vp8_filter each macro block
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+
+ filter_level = baseline_filter_level[Segment];
+
+ // Distance of Mb to the various image edges.
+ // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ // Apply any context driven MB level adjustment
+ vp8_adjust_mb_lf_value(mbd, &filter_level);
+
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0)
+ cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+
+ // don't apply across umv border
+ if (mb_row > 0)
+ cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0)
+ cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mbd->mode_info_context++; // step to next MB
+ }
+
+ // advance to the start of the next MB row; mode_info has one extra
+ // (border) entry per row that must be skipped
+ y_ptr += post->y_stride * 16 - post->y_width;
+ u_ptr += post->uv_stride * 8 - post->uv_width;
+ v_ptr += post->uv_stride * 8 - post->uv_width;
+
+ mbd->mode_info_context++; // Skip border mb
+ }
+}
+
+
+// Luma-only variant of vp8_loop_filter_frame(): same per-MB level logic, but
+// chroma pointers are passed as 0 so the plane wrappers skip U/V work.
+// sharpness_lvl is accepted for interface symmetry and ignored.
+void vp8_loop_filter_frame_yonly
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl,
+ int sharpness_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ int i;
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+
+ loop_filter_info *lfi = cm->lf_info;
+ int baseline_filter_level[MAX_MB_SEGMENTS];
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
+ int frame_type = cm->frame_type;
+
+ (void) sharpness_lvl;
+
+ //MODE_INFO * this_mb_mode_info = cm->mi; // Point at base of Mb MODE_INFO list
+ mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
+
+ // Note the baseline filter values for each segment
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ {
+ // Abs value
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ // Delta Value
+ else
+ {
+ baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ baseline_filter_level[i] = default_filt_lvl;
+ }
+
+ // Initialize the loop filter for this frame.
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter(lfi, frame_type);
+
+ // Set up the buffer pointers
+ y_ptr = post->y_buffer;
+
+ // vp8_filter each macro block
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+ filter_level = baseline_filter_level[Segment];
+
+ // Apply any context driven MB level adjustment
+ vp8_adjust_mb_lf_value(mbd, &filter_level);
+
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0)
+ cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+
+ // don't apply across umv border
+ if (mb_row > 0)
+ cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0)
+ cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+ }
+
+ y_ptr += 16;
+ mbd->mode_info_context ++; // step to next MB
+
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mbd->mode_info_context ++; // Skip border mb
+ }
+
+}
+
+
+// Filter only a central horizontal band of the luma plane — used to cheaply
+// estimate filter effect without touching the whole frame. The band starts a
+// quarter of the way down (y_height >> 5 macroblock rows) and covers
+// y_height >> (4 + Fraction) MB rows (minimum 1). Luma only; no per-MB
+// delta adjustment is applied here. sharpness_lvl is ignored.
+void vp8_loop_filter_partial_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl,
+ int sharpness_lvl,
+ int Fraction
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ int i;
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+ //int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+
+ int linestocopy;
+
+ loop_filter_info *lfi = cm->lf_info;
+ int baseline_filter_level[MAX_MB_SEGMENTS];
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
+ int frame_type = cm->frame_type;
+
+ (void) sharpness_lvl;
+
+ //MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list
+ mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list
+
+ linestocopy = (post->y_height >> (4 + Fraction));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4; // convert MB rows to pixel rows
+
+ // Note the baseline filter values for each segment
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ {
+ // Abs value
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ // Delta Value
+ else
+ {
+ baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ baseline_filter_level[i] = default_filt_lvl;
+ }
+
+ // Initialize the loop filter for this frame.
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter(lfi, frame_type);
+
+ // Set up the buffer pointers
+ y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
+
+ // vp8_filter each macro block
+ for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+ filter_level = baseline_filter_level[Segment];
+
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0)
+ cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+
+ // horizontal MB edge is filtered unconditionally: the band never
+ // starts at the true frame top
+ cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0)
+ cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+ }
+
+ y_ptr += 16;
+ mbd->mode_info_context += 1; // step to next MB
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mbd->mode_info_context += 1; // Skip border mb
+ }
+}
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
new file mode 100644
index 000000000..c6ce508cc
--- /dev/null
+++ b/vp8/common/loopfilter.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+
+#define MAX_LOOP_FILTER 63
+
+// Filter flavour selected in the frame header: the normal (6-tap, HEV-aware)
+// filter or the cheaper simple filter.
+typedef enum
+{
+ NORMAL_LOOPFILTER = 0,
+ SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+// FRK
+// Need to align this structure so when it is declared and
+// passed it can be loaded into vector registers.
+// FRK
+// Per-filter-level thresholds. Each scalar is replicated into a 16-byte,
+// 16-aligned array so SIMD implementations can load it straight into a
+// vector register. lim/flim/thr are subblock-edge limits, mb* the stronger
+// macroblock-edge limits; the uv* copies are the chroma equivalents.
+typedef struct
+{
+ DECLARE_ALIGNED(16, signed char, lim[16]);
+ DECLARE_ALIGNED(16, signed char, flim[16]);
+ DECLARE_ALIGNED(16, signed char, thr[16]);
+ DECLARE_ALIGNED(16, signed char, mbflim[16]);
+ DECLARE_ALIGNED(16, signed char, mbthr[16]);
+ DECLARE_ALIGNED(16, signed char, uvlim[16]);
+ DECLARE_ALIGNED(16, signed char, uvflim[16]);
+ DECLARE_ALIGNED(16, signed char, uvthr[16]);
+ DECLARE_ALIGNED(16, signed char, uvmbflim[16]);
+ DECLARE_ALIGNED(16, signed char, uvmbthr[16]);
+} loop_filter_info;
+
+
+// Signature macros for RTCD: a single-edge filter (operates on `count` * 8
+// pixels along one edge) and a whole-plane-set block filter.
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const signed char *flimit,\
+ const signed char *limit, const signed char *thresh, int count)
+
+#define prototype_loopfilter_block(sym) \
+ void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
+ int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/loopfilter_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/loopfilter_arm.h"
+#endif
+
+#ifndef vp8_lf_normal_mb_v
+#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_normal_mb_v);
+
+#ifndef vp8_lf_normal_b_v
+#define vp8_lf_normal_b_v vp8_loop_filter_bv_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_normal_b_v);
+
+#ifndef vp8_lf_normal_mb_h
+#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_normal_mb_h);
+
+#ifndef vp8_lf_normal_b_h
+#define vp8_lf_normal_b_h vp8_loop_filter_bh_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_normal_b_h);
+
+
+#ifndef vp8_lf_simple_mb_v
+#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_simple_mb_v);
+
+#ifndef vp8_lf_simple_b_v
+#define vp8_lf_simple_b_v vp8_loop_filter_bvs_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_simple_b_v);
+
+#ifndef vp8_lf_simple_mb_h
+#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_simple_mb_h);
+
+#ifndef vp8_lf_simple_b_h
+#define vp8_lf_simple_b_h vp8_loop_filter_bhs_c
+#endif
+extern prototype_loopfilter_block(vp8_lf_simple_b_h);
+
+typedef prototype_loopfilter_block((*vp8_lf_block_fn_t));
+typedef struct
+{
+ vp8_lf_block_fn_t normal_mb_v;
+ vp8_lf_block_fn_t normal_b_v;
+ vp8_lf_block_fn_t normal_mb_h;
+ vp8_lf_block_fn_t normal_b_h;
+ vp8_lf_block_fn_t simple_mb_v;
+ vp8_lf_block_fn_t simple_b_v;
+ vp8_lf_block_fn_t simple_mb_h;
+ vp8_lf_block_fn_t simple_b_h;
+} vp8_loopfilter_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define LF_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define LF_INVOKE(ctx,fn) vp8_lf_##fn
+#endif
+
+
+#endif
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
new file mode 100644
index 000000000..7d16e4843
--- /dev/null
+++ b/vp8/common/loopfilter_filters.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+
+#define NEW_LOOPFILTER_MASK
+
+typedef unsigned char uc;
+
+// Clamp t to the signed 8-bit range [-128, 127].
+__inline signed char vp8_signed_char_clamp(int t)
+{
+ t = (t < -128 ? -128 : t);
+ t = (t > 127 ? 127 : t);
+ return (signed char) t;
+}
+
+
+// should we apply any filter at all ( 11111111 yes, 00000000 no)
+// Each |difference| > limit forces the mask off for this pixel column; the
+// (cond) * -1 idiom turns a boolean into an all-ones byte.
+__inline signed char vp8_filter_mask(signed char limit, signed char flimit,
+ uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
+{
+ signed char mask = 0;
+ mask |= (abs(p3 - p2) > limit) * -1;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(q3 - q2) > limit) * -1;
+#ifndef NEW_LOOPFILTER_MASK
+ mask |= (abs(p0 - q0) > flimit) * -1;
+#else
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;
+#endif
+ mask = ~mask; // invert: accumulated "don't filter" bits become a "do filter" mask
+ return mask;
+}
+
+// is there high variance internal edge ( 11111111 yes, 00000000 no)
+__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
+{
+ signed char hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+// Normal 4-tap filter applied across one pixel edge (p1 p0 | q0 q1).
+// Pixels are biased by ^0x80 to work in signed arithmetic; `mask` gates
+// whether any change is made and `hev` selects the stronger variant.
+__inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
+
+{
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char vp8_filter, Filter1, Filter2; // local deliberately shadows the function name
+ signed char u;
+
+ ps1 = (signed char) * op1 ^ 0x80;
+ ps0 = (signed char) * op0 ^ 0x80;
+ qs0 = (signed char) * oq0 ^ 0x80;
+ qs1 = (signed char) * oq1 ^ 0x80;
+
+ // add outer taps if we have high edge variance
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+ vp8_filter &= hev;
+
+ // inner taps
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ vp8_filter &= mask;
+
+ // save bottom 3 bits so that we round one side +4 and the other +3
+ // if it equals 4 we'll set to adjust by -1 to account for the fact
+ // we'd round 3 the other way
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ vp8_filter = Filter1;
+
+ // outer tap adjustments
+ vp8_filter += 1;
+ vp8_filter >>= 1;
+ vp8_filter &= ~hev; // outer taps only adjusted when NOT high edge variance
+
+ u = vp8_signed_char_clamp(qs1 - vp8_filter);
+ *oq1 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps1 + vp8_filter);
+ *op1 = u ^ 0x80;
+
+}
+// Normal filter along a horizontal edge: walks count*8 pixel columns,
+// reading 4 pixels above and 4 below the edge (stride p between rows).
+void vp8_loop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p, //pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count
+)
+{
+ int hev = 0; // high edge variance
+ signed char mask = 0;
+ int i = 0;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ do
+ {
+ mask = vp8_filter_mask(limit[i], flimit[i],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+}
+
+// Normal filter along a vertical edge: walks count*8 rows, reading 4 pixels
+// left and 4 right of the edge on each row.
+void vp8_loop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count
+)
+{
+ int hev = 0; // high edge variance
+ signed char mask = 0;
+ int i = 0;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ do
+ {
+ mask = vp8_filter_mask(limit[i], flimit[i],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+
+ vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+ s += p;
+ }
+ while (++i < count * 8);
+}
+
+// Macroblock-edge (wide) filter: adjusts three pixels on each side of the
+// edge. High-edge-variance columns get the narrow vp8_filter-style update;
+// low-variance columns get the wider 27/18/9 tap smoothing.
+__inline void vp8_mbfilter(signed char mask, signed char hev,
+ uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
+{
+ signed char s, u;
+ signed char vp8_filter, Filter1, Filter2;
+ signed char ps2 = (signed char) * op2 ^ 0x80;
+ signed char ps1 = (signed char) * op1 ^ 0x80;
+ signed char ps0 = (signed char) * op0 ^ 0x80;
+ signed char qs0 = (signed char) * oq0 ^ 0x80;
+ signed char qs1 = (signed char) * oq1 ^ 0x80;
+ signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+ // add outer taps if we have high edge variance
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ vp8_filter &= mask;
+
+ Filter2 = vp8_filter;
+ Filter2 &= hev;
+
+ // save bottom 3 bits so that we round one side +4 and the other +3
+ Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+ Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+ ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+ // only apply wider filter if not high edge variance
+ vp8_filter &= ~hev;
+ Filter2 = vp8_filter;
+
+ // roughly 3/7th difference across boundary
+ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+ s = vp8_signed_char_clamp(qs0 - u);
+ *oq0 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps0 + u);
+ *op0 = s ^ 0x80;
+
+ // roughly 2/7th difference across boundary
+ u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+ s = vp8_signed_char_clamp(qs1 - u);
+ *oq1 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps1 + u);
+ *op1 = s ^ 0x80;
+
+ // roughly 1/7th difference across boundary
+ u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+ s = vp8_signed_char_clamp(qs2 - u);
+ *oq2 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps2 + u);
+ *op2 = s ^ 0x80;
+}
+
+// Wide (macroblock-edge) filter along a horizontal edge, count*8 columns.
+void vp8_mbloop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count
+)
+{
+ signed char hev = 0; // high edge variance
+ signed char mask = 0;
+ int i = 0;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ do
+ {
+
+ mask = vp8_filter_mask(limit[i], flimit[i],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+
+}
+
+
+// Wide (macroblock-edge) filter along a vertical edge, count*8 rows.
+void vp8_mbloop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count
+)
+{
+ signed char hev = 0; // high edge variance
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+
+ mask = vp8_filter_mask(limit[i], flimit[i],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+
+ vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+
+ s += p;
+ }
+ while (++i < count * 8);
+
+}
+
+// should we apply any filter at all ( 11111111 yes, 00000000 no)
+// Simple-filter variant: a single edge-difference test, no inner-limit checks.
+__inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
+{
+// Why does this cause problems for win32?
+// error C2143: syntax error : missing ';' before 'type'
+// (void) limit;
+// NOTE(review): likely because a statement before the declaration below is
+// invalid C89 (declarations must precede statements) — confirm before enabling.
+#ifndef NEW_LOOPFILTER_MASK
+ signed char mask = (abs(p0 - q0) <= flimit) * -1;
+#else
+ signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;
+#endif
+ return mask;
+}
+
+// Simple filter: adjusts only the two pixels adjacent to the edge (p0, q0),
+// using the same +4/+3 asymmetric rounding as the normal filter.
+__inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+{
+ signed char vp8_filter, Filter1, Filter2;
+ signed char p1 = (signed char) * op1 ^ 0x80;
+ signed char p0 = (signed char) * op0 ^ 0x80;
+ signed char q0 = (signed char) * oq0 ^ 0x80;
+ signed char q1 = (signed char) * oq1 ^ 0x80;
+ signed char u;
+
+ vp8_filter = vp8_signed_char_clamp(p1 - q1);
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
+ vp8_filter &= mask;
+
+ // save bottom 3 bits so that we round one side +4 and the other +3
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+ Filter1 >>= 3;
+ u = vp8_signed_char_clamp(q0 - Filter1);
+ *oq0 = u ^ 0x80;
+
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(p0 + Filter2);
+ *op0 = u ^ 0x80;
+}
+
+// Simple filter along a horizontal edge, count*8 columns. `thresh` is unused
+// (the simple filter has no high-edge-variance branch).
+void vp8_loop_filter_simple_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count
+)
+{
+ signed char mask = 0;
+ int i = 0;
+ (void) thresh;
+
+ do
+ {
+ //mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);
+ mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+ vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+ while (++i < count * 8);
+}
+
+// Simple filter along a vertical edge, count*8 rows. `thresh` is unused.
+void vp8_loop_filter_simple_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count
+)
+{
+ signed char mask = 0;
+ int i = 0;
+ (void) thresh;
+
+ do
+ {
+ //mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);
+ mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
+ vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
+ s += p;
+ }
+ while (++i < count * 8);
+
+}
diff --git a/vp8/common/mac_specs.h b/vp8/common/mac_specs.h
new file mode 100644
index 000000000..97bffc776
--- /dev/null
+++ b/vp8/common/mac_specs.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if !defined(_mac_specs_h)
+#define _mac_specs_h
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+ extern unsigned int vp8_read_tsc(void); // read a timestamp counter; (void) rather than (): empty parens declare an unspecified argument list in C
+
+ extern unsigned int vp8_get_processor_freq(void); // query CPU frequency (units not specified here)
+
+ extern unsigned int vpx_has_altivec(void); // nonzero when AltiVec SIMD is available -- presumably PowerPC/Mac builds only
+
+#if defined(__cplusplus)
+}
+#endif
+
+
+#endif
diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c
new file mode 100644
index 000000000..a7e0ce99a
--- /dev/null
+++ b/vp8/common/mbpitch.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+typedef enum
+{
+ PRED = 0, // set up the prediction-buffer (pre) pointers of each block
+ DEST = 1, // set up the destination/reconstruction (dst) pointers
+} BLOCKSET;
+
+void vp8_setup_block // point one block descriptor at its location within a frame plane
+(
+ BLOCKD *b, // block descriptor to initialize
+ int mv_stride, // NOTE(review): unused in this function -- looks vestigial; confirm before removing
+ unsigned char **base, // address of the plane's base-pointer variable
+ int Stride, // plane stride
+ int offset, // offset of this block within the plane
+ BLOCKSET bs // DEST fills the dst fields; anything else fills the pre fields
+)
+{
+
+ if (bs == DEST)
+ {
+ b->dst_stride = Stride;
+ b->dst = offset;
+ b->base_dst = base;
+ }
+ else
+ {
+ b->pre_stride = Stride;
+ b->pre = offset;
+ b->base_pre = base;
+ }
+
+}
+
+void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs) // aim the y/u/v block descriptors at either the dst or pre frame buffer
+{
+ int block;
+
+ unsigned char **y, **u, **v;
+
+ if (bs == DEST)
+ {
+ y = &x->dst.y_buffer;
+ u = &x->dst.u_buffer;
+ v = &x->dst.v_buffer;
+ }
+ else
+ {
+ y = &x->pre.y_buffer;
+ u = &x->pre.u_buffer;
+ v = &x->pre.v_buffer;
+ }
+
+ for (block = 0; block < 16; block++) // y blocks: 4x4 grid of 4x4-pel blocks
+ {
+ vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
+ (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs); // row = block>>2, col = block&3
+ }
+
+ for (block = 16; block < 20; block++) // U and V blocks: 2x2 grids of 4x4-pel blocks
+ {
+ vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
+ ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs); // NOTE(review): strides read from x->dst even when bs == PRED -- pre presumably shares geometry; confirm
+
+ vp8_setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
+ ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
+ }
+}
+
+void vp8_setup_block_dptrs(MACROBLOCKD *x) // wire each block's diff/predictor/qcoeff pointers into the macroblock-level arrays
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++) // 16 luma blocks: diff[0..255], laid out as a 16x16 region
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].diff = &x->diff[r * 4 * 16 + c * 4];
+ x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++) // blocks 16-19: diff[256..319], an 8x8 region (U plane)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16+r*2+c].diff = &x->diff[256 + r * 4 * 8 + c * 4];
+ x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 2; r++) // blocks 20-23: diff[320..383], an 8x8 region (V plane)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20+r*2+c].diff = &x->diff[320+ r * 4 * 8 + c * 4];
+ x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ x->block[24].diff = &x->diff[384]; // block 24: presumably the second-order (Y2) block -- note it gets no predictor pointer
+
+ for (r = 0; r < 25; r++) // all 25 blocks get 16 coefficients apiece in the shared coeff arrays
+ {
+ x->block[r].qcoeff = x->qcoeff + r * 16;
+ x->block[r].dqcoeff = x->dqcoeff + r * 16;
+ }
+}
+
+void vp8_build_block_doffsets(MACROBLOCKD *x) // initialize both destination and prediction pointers/strides for every block
+{
+
+ // fill the destination (dst) pointers, then the prediction (pre) pointers
+ vp8_setup_macroblock(x, DEST);
+ vp8_setup_macroblock(x, PRED);
+}
diff --git a/vp8/common/modecont.c b/vp8/common/modecont.c
new file mode 100644
index 000000000..9301a2567
--- /dev/null
+++ b/vp8/common/modecont.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] = // inter-mode coding probabilities, one 4-entry row per context -- TODO(review): confirm index semantics against the mode decoder
+{
+ {
+ // 0
+ 7, 1, 1, 143,
+ },
+ {
+ // 1
+ 14, 18, 14, 107,
+ },
+ {
+ // 2
+ 135, 64, 57, 68,
+ },
+ {
+ // 3
+ 60, 56, 128, 65,
+ },
+ {
+ // 4
+ 159, 134, 128, 34,
+ },
+ {
+ // 5
+ 234, 188, 128, 28,
+ },
+};
diff --git a/vp8/common/modecont.h b/vp8/common/modecont.h
new file mode 100644
index 000000000..0c57651ed
--- /dev/null
+++ b/vp8/common/modecont.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECONT_H
+#define __INC_MODECONT_H
+
+extern const int vp8_mode_contexts[6][4]; // defined in modecont.c
+
+#endif
diff --git a/vp8/common/modecontext.c b/vp8/common/modecontext.c
new file mode 100644
index 000000000..ceee74c70
--- /dev/null
+++ b/vp8/common/modecontext.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "entropymode.h"
+
+const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] = // default keyframe sub-block mode counts, indexed [above mode][left mode][this block's mode]
+{
+ {
+ //Above Mode : 0
+ { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, // left_mode 0
+ { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, // left_mode 1
+ { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, // left_mode 2
+ { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, // left_mode 3
+ { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, // left_mode 4
+ { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, // left_mode 5
+ { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, // left_mode 6
+ { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, // left_mode 7
+ { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, // left_mode 8
+ { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, // left_mode 9
+ },
+ {
+ //Above Mode : 1
+ { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, // left_mode 0
+ { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, // left_mode 1
+ { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, // left_mode 2
+ { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, // left_mode 3
+ { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, // left_mode 4
+ { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, // left_mode 5
+ { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, // left_mode 6
+ { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, // left_mode 7
+ { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, // left_mode 8
+ { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, // left_mode 9
+ },
+ {
+ //Above Mode : 2
+ { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, // left_mode 0
+ { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, // left_mode 1
+ { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, // left_mode 2
+ { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, // left_mode 3
+ { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, // left_mode 4
+ { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, // left_mode 5
+ { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, // left_mode 6
+ { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, // left_mode 7
+ { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, // left_mode 8
+ { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, // left_mode 9
+ },
+ {
+ //Above Mode : 3
+ { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, // left_mode 0
+ { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, // left_mode 1
+ { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, // left_mode 2
+ { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, // left_mode 3
+ { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, // left_mode 4
+ { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, // left_mode 5
+ { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, // left_mode 6
+ { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, // left_mode 7
+ { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, // left_mode 8
+ { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, // left_mode 9
+ },
+ {
+ //Above Mode : 4
+ { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, // left_mode 0
+ { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, // left_mode 1
+ { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, // left_mode 2
+ { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, // left_mode 3
+ { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, // left_mode 4
+ { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, // left_mode 5
+ { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, // left_mode 6
+ { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, // left_mode 7
+ { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, // left_mode 8
+ { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, // left_mode 9
+ },
+ {
+ //Above Mode : 5
+ { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, // left_mode 0
+ { 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, // left_mode 1
+ { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, // left_mode 2
+ { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, // left_mode 3
+ { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, // left_mode 4
+ { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, // left_mode 5
+ { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, // left_mode 6
+ { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, // left_mode 7
+ { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, // left_mode 8
+ { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, // left_mode 9
+ },
+ {
+ //Above Mode : 6
+ { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, // left_mode 0
+ { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, // left_mode 1
+ { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, // left_mode 2
+ { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, // left_mode 3
+ { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, // left_mode 4
+ { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, // left_mode 5
+ { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, // left_mode 6
+ { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, // left_mode 7
+ { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, // left_mode 8
+ { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, // left_mode 9
+ },
+ {
+ //Above Mode : 7
+ { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, // left_mode 0
+ { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, // left_mode 1
+ { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, // left_mode 2
+ { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, // left_mode 3
+ { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, // left_mode 4
+ { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, // left_mode 5
+ { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, // left_mode 6
+ { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, // left_mode 7
+ { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, // left_mode 8
+ { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, // left_mode 9
+ },
+ {
+ //Above Mode : 8
+ { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, // left_mode 0
+ { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, // left_mode 1
+ { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, // left_mode 2
+ { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, // left_mode 3
+ { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, // left_mode 4
+ { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, // left_mode 5
+ { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, // left_mode 6
+ { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, // left_mode 7
+ { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, // left_mode 8
+ { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, // left_mode 9
+ },
+ {
+ //Above Mode : 9
+ { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, // left_mode 0
+ { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, // left_mode 1
+ { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, // left_mode 2
+ { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, // left_mode 3
+ { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, // left_mode 4
+ { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, // left_mode 5
+ { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, // left_mode 6
+ { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, // left_mode 7
+ { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, // left_mode 8
+ { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, // left_mode 9
+ },
+};
diff --git a/vp8/common/mv.h b/vp8/common/mv.h
new file mode 100644
index 000000000..3d8418108
--- /dev/null
+++ b/vp8/common/mv.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+
+typedef struct
+{
+ short row; // vertical motion component (sub-pel units -- confirm exact precision against the codec spec)
+ short col; // horizontal motion component
+} MV;
+
+#endif
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
new file mode 100644
index 000000000..b66c40070
--- /dev/null
+++ b/vp8/common/onyx.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_H
+#define __INC_VP8_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx_codec/internal/vpx_codec_internal.h"
+#include "vpx_scale/yv12config.h"
+#include "type_aliases.h"
+#include "ppflags.h"
+ typedef int *VP8_PTR; // opaque handle to a compressor instance
+
+ /* Create/destroy static data structures. */
+
+ typedef enum
+ {
+ NORMAL = 0, // no scaling (1:1 -- see Scale2Ratio below)
+ FOURFIVE = 1, // scale by 4/5
+ THREEFIVE = 2, // scale by 3/5
+ ONETWO = 3 // scale by 1/2
+
+ } VPX_SCALING;
+
+ typedef enum
+ {
+ VP8_LAST_FLAG = 1, // bitmask flags naming the three reference frames
+ VP8_GOLD_FLAG = 2,
+ VP8_ALT_FLAG = 4
+ } VP8_REFFRAME;
+
+
+ typedef enum
+ {
+ USAGE_STREAM_FROM_SERVER = 0x0, // presumably CBR streaming vs. VBR local file -- see end_usage below
+ USAGE_LOCAL_FILE_PLAYBACK = 0x1
+ } END_USAGE;
+
+
+ typedef enum
+ {
+ MODE_REALTIME = 0x0, // values correspond to the Mode field documented in VP8_CONFIG
+ MODE_GOODQUALITY = 0x1,
+ MODE_BESTQUALITY = 0x2,
+ MODE_FIRSTPASS = 0x3,
+ MODE_SECONDPASS = 0x4,
+ MODE_SECONDPASS_BEST = 0x5,
+ } MODE;
+
+ typedef enum
+ {
+ FRAMEFLAGS_KEY = 1, // bitmask values reported via frame_flags
+ FRAMEFLAGS_GOLDEN = 2,
+ FRAMEFLAGS_ALTREF = 4,
+ } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+ static __inline void Scale2Ratio(int mode, int *hr, int *hs) // map a VPX_SCALING mode to a ratio hr/hs (numerator/denominator)
+ {
+ switch (mode)
+ {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1; // fall back to 1:1 for an invalid mode (debug builds assert)
+ *hs = 1;
+ assert(0);
+ break;
+ }
+ }
+
+ typedef struct
+ {
+ int Version; // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode
+ int Width; // width of data passed to the compressor
+ int Height; // height of data passed to the compressor
+ double frame_rate; // set to passed in framerate
+ int target_bandwidth; // bandwidth to be used in kilobits per second
+
+ int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0
+ int Sharpness; // parameter used for sharpening output: recommendation 0
+ int cpu_used;
+
+ // mode ->
+ //(0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
+ // a television signal or feed from a live camera). ( speed setting controls how fast )
+ //(1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
+ // encode the output. ( speed setting controls how fast )
+ //(2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
+ // speed. The output is compressed at the highest possible quality. This option takes the longest
+ // amount of time to encode. ( speed setting ignored )
+ //(3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
+ // pass. ( speed setting controls how fast )
+ //(4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
+ // pass to create the compressed output. ( speed setting controls how fast )
+ //(5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first
+ // encoding pass to create the compressed output using the highest possible quality, and taking a
+ // longer amount of time to encode. ( speed setting ignored )
+ int Mode; // see MODE enum above
+
+ // Key Framing Operations
+ int auto_key; // automatically detect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+
+ int allow_lag; // allow lagged compression (if 0, lag_in_frames is ignored)
+ int lag_in_frames; // how many frames lag before we start encoding
+
+ //----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ int end_usage; // vbr or cbr
+
+ // shoot to keep buffer full at all times by undershooting a bit 95 recommended
+ int under_shoot_pct;
+
+ // buffering parameters
+ int starting_buffer_level; // in seconds
+ int optimal_buffer_level;
+ int maximum_buffer_size;
+
+ // controlling quality
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+
+ // allow internal resizing ( currently disabled in the build !!!!!)
+ int allow_spatial_resampling;
+ int resample_down_water_mark;
+ int resample_up_water_mark;
+
+ // allow internal frame rate alterations
+ int allow_df;
+ int drop_frames_water_mark;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ //----------------------------------------------------------------
+
+
+ // these parameters aren't to be used in final build don't use!!!
+ int play_alternate;
+ int alt_freq;
+ int alt_q;
+ int key_q;
+ int gold_q;
+
+
+ int multi_threaded; // how many threads to run the encoder on
+ int token_partitions; // how many token partitions to create for multi core decoding
+ int encode_breakout; // early breakout encode threshold : for video conf recommend 800
+
+ int error_resilient_mode; // if running over udp networks provides decodable frames after a
+ // dropped packet
+
+ int arnr_max_frames;
+ int arnr_strength ;
+ int arnr_type ;
+
+
+ struct vpx_fixed_buf two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
+ } VP8_CONFIG;
+
+
+ void vp8_initialize(void); // one-time global initialization; (void) rather than (): empty parens declare an unspecified argument list in C
+
+ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf); // allocate a compressor instance
+ void vp8_remove_compressor(VP8_PTR *comp); // destroy an instance via the caller's handle
+
+ void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
+ void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
+
+// receive a frame's worth of data; the caller can assume that a copy of this frame is made
+// and not just a copy of the pointer.
+ int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
+ int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
+ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
+
+ int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
+ int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
+ int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_update_entropy(VP8_PTR comp, int update);
+ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
+ int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols);
+ int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+ int vp8_get_quantizer(VP8_PTR c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
new file mode 100644
index 000000000..a40ffb9f0
--- /dev/null
+++ b/vp8/common/onyxc_int.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8C_INT_H
+#define __INC_VP8C_INT_H
+
+#include "vpx_ports/config.h"
+#include "vpx_codec/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#include "idct.h"
+#include "recon.h"
+#include "postproc.h"
+
+//#ifdef PACKET_TESTING
+#include "header.h"
+//#endif
+
+/* Create/destroy static data structures. */
+
+void vp8_initialize_common(void); // one-time initialization of shared static data
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1) // 128 quantizer index values
+
+
+typedef struct frame_contexts // per-frame entropy-coding state
+{
+ vp8_prob bmode_prob [VP8_BINTRAMODES-1];
+ vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */
+ vp8_prob uv_mode_prob [VP8_UV_MODES-1];
+ vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
+ vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
+ MV_CONTEXT mvc[2];
+ MV_CONTEXT pre_mvc[2]; //not to calculate the mvcost for the frame if mvc doesn't change.
+} FRAME_CONTEXT;
+
+typedef enum
+{
+ ONE_PARTITION = 0, // token partition count -- presumably 1 << value partitions; confirm with bitstream writer
+ TWO_PARTITION = 1,
+ FOUR_PARTITION = 2,
+ EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum
+{
+ RECON_CLAMP_REQUIRED = 0, // whether reconstruction must clamp pixel values
+ RECON_CLAMP_NOTREQUIRED = 1
+} CLAMP_TYPE;
+
+typedef enum
+{
+ SIXTAP = 0, // sub-pixel interpolation filter selection
+ BILINEAR = 1
+} INTERPOLATIONFILTERTYPE;
+
+typedef struct VP8_COMMON_RTCD // run-time CPU-dispatch function tables
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ vp8_idct_rtcd_vtable_t idct;
+ vp8_recon_rtcd_vtable_t recon;
+ vp8_subpix_rtcd_vtable_t subpix;
+ vp8_loopfilter_rtcd_vtable_t loopfilter;
+ vp8_postproc_rtcd_vtable_t postproc;
+#else
+ int unused; // keeps the struct non-empty when detection is compiled out
+#endif
+} VP8_COMMON_RTCD;
+
+typedef struct VP8Common // codec-wide state shared by encode and decode paths
+{
+ struct vpx_internal_error_info error;
+
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][4][4]);
+
+ int Width;
+ int Height;
+ int horiz_scale;
+ int vert_scale;
+
+ YUV_TYPE clr_type;
+ CLAMP_TYPE clamp_type;
+
+ YV12_BUFFER_CONFIG last_frame;
+ YV12_BUFFER_CONFIG golden_frame;
+ YV12_BUFFER_CONFIG alt_ref_frame;
+ YV12_BUFFER_CONFIG new_frame;
+ YV12_BUFFER_CONFIG *frame_to_show;
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+ FRAME_TYPE last_frame_type; //Added to check if vp8_frame_init_loop_filter() can be skipped.
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ // profile settings
+ int mb_no_coeff_skip;
+ int no_lpf;
+ int simpler_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+ int last_kf_gf_q; // Q used on the last GF or KF
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+ unsigned char *gf_active_flags; // Record of which MBs still refer to last golden frame either directly or through 0,0
+ int gf_active_count;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+
+
+ INTERPOLATIONFILTERTYPE mcomp_filter_type;
+ LOOPFILTERTYPE last_filter_type;
+ LOOPFILTERTYPE filter_type;
+ loop_filter_info lf_info[MAX_LOOP_FILTER+1];
+ prototype_loopfilter_block((*lf_mbv)); // loop-filter function pointers: macroblock/block, vertical/horizontal
+ prototype_loopfilter_block((*lf_mbh));
+ prototype_loopfilter_block((*lf_bv));
+ prototype_loopfilter_block((*lf_bh));
+ int filter_level;
+ int last_sharpness_level;
+ int sharpness_level;
+
+ int refresh_last_frame; // Two state 0 = NO, 1 = YES
+ int refresh_golden_frame; // Two state 0 = NO, 1 = YES
+ int refresh_alt_ref_frame; // Two state 0 = NO, 1 = YES
+
+ int copy_buffer_to_gf; // 0 none, 1 Last to GF, 2 ARF to GF
+ int copy_buffer_to_arf; // 0 none, 1 Last to ARF, 2 GF to ARF
+
+ int refresh_entropy_probs; // Two state 0 = NO, 1 = YES
+
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; // Two state 0, 1
+
+ // Y,U,V,Y2
+ ENTROPY_CONTEXT *above_context[4]; // row of context for each plane
+ ENTROPY_CONTEXT left_context[4][4]; // (up to) 4 contexts ""
+
+
+ // keyframe block modes are predicted by their above, left neighbors
+
+ vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1];
+ vp8_prob kf_ymode_prob [VP8_YMODES-1]; /* keyframe "" */
+ vp8_prob kf_uv_mode_prob [VP8_UV_MODES-1];
+
+
+ FRAME_CONTEXT lfc; // last frame entropy
+ FRAME_CONTEXT fc; // this frame entropy
+
+ unsigned int current_video_frame;
+
+ int near_boffset[3];
+ int version;
+
+ TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+ VP8_HEADER oh;
+#endif
+ double bitrate;
+ double framerate;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP8_COMMON_RTCD rtcd;
+#endif
+ struct postproc_state postproc_state;
+} VP8_COMMON;
+
+
+void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level); // loop-filter entry points, defined elsewhere in common
+void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
+
+#endif
diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h
new file mode 100644
index 000000000..644c0ec77
--- /dev/null
+++ b/vp8/common/onyxd.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8D_H
+#define __INC_VP8D_H
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "type_aliases.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+
+ typedef void *VP8D_PTR; // opaque handle to a decoder instance
+ typedef struct
+ {
+ int Width;
+ int Height;
+ int Version;
+ int postprocess;
+ int max_threads;
+ } VP8D_CONFIG;
+ typedef enum // NOTE(review): duplicates VP8_REFFRAME in onyx.h -- keep the values in sync
+ {
+ VP8_LAST_FLAG = 1,
+ VP8_GOLD_FLAG = 2,
+ VP8_ALT_FLAG = 4
+ } VP8_REFFRAME;
+
+ typedef enum
+ {
+ VP8D_OK = 0
+ } VP8D_SETTING;
+
+ void vp8dx_initialize(void);
+
+ void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x);
+
+ int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
+
+ int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
+ int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags);
+
+ int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+
+ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
+
+ void vp8dx_remove_decompressor(VP8D_PTR comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/vp8/common/partialgfupdate.h b/vp8/common/partialgfupdate.h
new file mode 100644
index 000000000..32a55ee6c
--- /dev/null
+++ b/vp8/common/partialgfupdate.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PARTIALGFUPDATE_H
+#define __INC_PARTIALGFUPDATE_H
+
+#include "onyxc_int.h"
+
+extern void update_gf_selective(VP8_COMMON *cm, MACROBLOCKD *x); // was ONYX_COMMON, a typedef that does not exist; onyxc_int.h declares VP8_COMMON
+
+#endif
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
new file mode 100644
index 000000000..f019925c3
--- /dev/null
+++ b/vp8/common/postproc.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+// global constants
+
+// 5-tap smoothing kernel used by the down/across filters. The taps sum
+// to 8 (result is >> 3); the +4 rounding term is added by the loops.
+static const short kernel5[] =
+{
+    1, 1, 4, 1, 1
+};
+
+// Pseudo-random rounding values in [0,14] used to dither the window
+// mean in vp8_mbpost_proc_down_c (indexed with random offsets there).
+// NOTE(review): several 10-entry rows appear more than once; the table
+// only needs to look random, so the repetition is presumably harmless.
+const short vp8_rv[] =
+{
+    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+
+extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
+
+/***********************************************************************************************************
+ */
+/* Two-pass edge-preserving smoothing used by deblocking/denoising.
+ * For each row: a vertical 5-tap pass ("down") from src into dst, then
+ * a horizontal 5-tap pass ("across") in place on dst. A tap only
+ * contributes if it differs from the center pixel by at most flimit;
+ * otherwise the pixel is left unfiltered (the gotos skip the store of
+ * the convolved value). dst_pixels_per_line is unused -- both passes
+ * advance by the source pitch.
+ * NOTE(review): reads up to 2 pixels above/below/left/right of the
+ * block; callers must supply extended borders.
+ */
+void vp8_post_proc_down_and_across_c
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    int src_pixels_per_line,
+    int dst_pixels_per_line,
+    int rows,
+    int cols,
+    int flimit
+)
+{
+    unsigned char *p_src, *p_dst;
+    int row;
+    int col;
+    int i;
+    int v;
+    int pitch = src_pixels_per_line;
+    unsigned char d[8];   // 8-entry delay line for the in-place pass
+    (void)dst_pixels_per_line;
+
+    for (row = 0; row < rows; row++)
+    {
+        // post_proc_down for one row
+        p_src = src_ptr;
+        p_dst = dst_ptr;
+
+        for (col = 0; col < cols; col++)
+        {
+
+            int kernel = 4;          // rounding term (result is >> 3)
+            int v = p_src[col];      // note: shadows the function-scope v
+
+            for (i = -2; i <= 2; i++)
+            {
+                // bail out if any tap differs too much from the center
+                if (abs(v - p_src[col+i*pitch]) > flimit)
+                    goto down_skip_convolve;
+
+                kernel += kernel5[2+i] * p_src[col+i*pitch];
+            }
+
+            v = (kernel >> 3);
+        down_skip_convolve:
+            p_dst[col] = v;
+        }
+
+        // now post_proc_across, in place on the row just written
+        p_src = dst_ptr;
+        p_dst = dst_ptr;
+
+        for (i = 0; i < 8; i++)
+            d[i] = p_src[i];
+
+        for (col = 0; col < cols; col++)
+        {
+            int kernel = 4;
+            v = p_src[col];
+
+            d[col&7] = v;
+
+            for (i = -2; i <= 2; i++)
+            {
+                if (abs(v - p_src[col+i]) > flimit)
+                    goto across_skip_convolve;
+
+                kernel += kernel5[2+i] * p_src[col+i];
+            }
+
+            d[col&7] = (kernel >> 3);
+        across_skip_convolve:
+
+            // write back two pixels behind so source taps stay unmodified
+            if (col >= 2)
+                p_dst[col-2] = d[(col-2)&7];
+        }
+
+        //handle the last two pixels
+        p_dst[col-2] = d[(col-2)&7];
+        p_dst[col-1] = d[(col-1)&7];
+
+
+        //next row
+        src_ptr += pitch;
+        dst_ptr += pitch;
+    }
+}
+
+// Map a quantizer value to the variance threshold ("flimit") used by
+// the macroblock post-proc filters: clamp to a minimum of 20, rescale
+// around 50, then square and divide by 3.
+int vp8_q2mbl(int x)
+{
+    if (x < 20) x = 20;
+
+    x = 50 + (x - 50) * 10 / 8;
+    return x * x / 3;
+}
+// In-place horizontal smoothing, row by row. A running sum and
+// sum-of-squares over a 16-pixel window is maintained; a pixel is
+// replaced by the rounded window mean only when the window's variance
+// measure (sumsq*15 - sum*sum) is below flimit. d[] is a 16-entry
+// delay line so results are written back 8 pixels behind the window
+// front, leaving source taps unmodified.
+// NOTE(review): reads/writes s[-8] .. s[cols+7]; assumes at least 8
+// valid border pixels on each side of the row -- confirm callers
+// extend the borders.
+void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
+{
+    int r, c, i;
+
+    unsigned char *s = src;
+    unsigned char d[16];
+
+
+    for (r = 0; r < rows; r++)
+    {
+        int sumsq = 0;
+        int sum = 0;
+
+        // prime the window with the 15 samples s[-8..6]
+        for (i = -8; i <= 6; i++)
+        {
+            sumsq += s[i] * s[i];
+            sum += s[i];
+            d[i+8] = 0;
+        }
+
+        for (c = 0; c < cols + 8; c++)
+        {
+            // slide the window: add s[c+7], drop s[c-8]
+            int x = s[c+7] - s[c-8];
+            int y = s[c+7] + s[c-8];
+
+            sum += x;
+            sumsq += x * y;   // a*a - b*b == (a-b)*(a+b)
+
+            d[c&15] = s[c];
+
+            if (sumsq * 15 - sum * sum < flimit)
+            {
+                // low variance: rounded mean of the 16-sample window
+                d[c&15] = (8 + sum + s[c]) >> 4;
+            }
+
+            s[c-8] = d[(c-8)&15];
+        }
+
+        s += pitch;
+    }
+}
+
+
+
+
+
+// In-place vertical smoothing, column by column -- the vertical
+// counterpart of vp8_mbpost_proc_across_ip_c. Low-variance pixels are
+// replaced by a dithered window mean: instead of a fixed +8 rounding
+// term, rv2[] indexes into vp8_rv with a random per-call offset (rv3)
+// plus a per-column offset, which breaks up banding artifacts.
+// NOTE(review): reads/writes 8 rows above and below the plane; assumes
+// extended borders, as with the horizontal variant.
+void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+{
+    int r, c, i;
+    const short *rv3 = &vp8_rv[63&rand()];
+
+    for (c = 0; c < cols; c++)
+    {
+        unsigned char *s = &dst[c];
+        int sumsq = 0;
+        int sum = 0;
+        unsigned char d[16];
+        const short *rv2 = rv3 + ((c * 17) & 127);
+
+        // prime the window with the 15 samples at rows -8..6
+        for (i = -8; i <= 6; i++)
+        {
+            sumsq += s[i*pitch] * s[i*pitch];
+            sum += s[i*pitch];
+        }
+
+        for (r = 0; r < rows + 8; r++)
+        {
+            // slide the window one row down
+            sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
+            sum += s[7*pitch] - s[-8*pitch];
+            d[r&15] = s[0];
+
+            if (sumsq * 15 - sum * sum < flimit)
+            {
+                d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
+            }
+
+            // commit the result 8 rows behind the window front
+            s[-8*pitch] = d[(r-8)&15];
+            s += pitch;
+        }
+    }
+}
+
+
+// Strong deblocking: a down+across pass on all three planes plus the
+// in-place "mb post proc" across/down passes on luma to smooth out
+// macroblock edges. Filter strength ppl is a cubic function of q;
+// low_var_thresh and flag are accepted for interface parity but unused.
+static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
+        YV12_BUFFER_CONFIG *post,
+        int q,
+        int low_var_thresh,
+        int flag,
+        vp8_postproc_rtcd_vtable_t *rtcd)
+{
+    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+    int ppl = (int)(level + .5);   // round to nearest
+    (void) low_var_thresh;
+    (void) flag;
+
+    POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
+    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
+    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
+
+    POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+    POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+
+}
+
+// Basic deblocking: one down+across pass per plane (no extra luma
+// smoothing). Uses the same q-derived strength as the stronger
+// de-macro-block variant above.
+// NOTE(review): 'extern' on a function definition is redundant.
+extern void vp8_deblock(YV12_BUFFER_CONFIG *source,
+                        YV12_BUFFER_CONFIG *post,
+                        int q,
+                        int low_var_thresh,
+                        int flag,
+                        vp8_postproc_rtcd_vtable_t *rtcd)
+{
+    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+    int ppl = (int)(level + .5);
+    (void) low_var_thresh;
+    (void) flag;
+
+    POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
+    POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+    POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+}
+
+// In-place denoise: run the down+across filter over the interior of
+// each source plane. A 2-pixel margin is skipped on every side (hence
+// the "+ 2*stride + 2" start offsets and the "- 4" width/height), so no
+// border extension is required. 'post', low_var_thresh and flag are
+// unused; the source buffer is both input and output.
+void vp8_de_noise(YV12_BUFFER_CONFIG *source,
+                  YV12_BUFFER_CONFIG *post,
+                  int q,
+                  int low_var_thresh,
+                  int flag,
+                  vp8_postproc_rtcd_vtable_t *rtcd)
+{
+    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+    int ppl = (int)(level + .5);
+    (void) post;
+    (void) low_var_thresh;
+    (void) flag;
+
+    POSTPROC_INVOKE(rtcd, downacross)(
+        source->y_buffer + 2 * source->y_stride + 2,
+        source->y_buffer + 2 * source->y_stride + 2,
+        source->y_stride,
+        source->y_stride,
+        source->y_height - 4,
+        source->y_width - 4,
+        ppl);
+    POSTPROC_INVOKE(rtcd, downacross)(
+        source->u_buffer + 2 * source->uv_stride + 2,
+        source->u_buffer + 2 * source->uv_stride + 2,
+        source->uv_stride,
+        source->uv_stride,
+        source->uv_height - 4,
+        source->uv_width - 4, ppl);
+    POSTPROC_INVOKE(rtcd, downacross)(
+        source->v_buffer + 2 * source->uv_stride + 2,
+        source->v_buffer + 2 * source->uv_stride + 2,
+        source->uv_stride,
+        source->uv_stride,
+        source->uv_height - 4,
+        source->uv_width - 4, ppl);
+
+}
+
+
+// Note: plain 'char' signedness is implementation-defined (unsigned by
+// default on ARM); prefer explicit signed/unsigned char when the sign
+// of the stored values matters.
+char vp8_an[8][64][3072];
+int vp8_cd[8][64];
+
+
+// Gaussian probability density with standard deviation 'sigma' and
+// mean 'mu', evaluated at 'x'. Scaled by 256 in fillrd() to build the
+// noise distribution table.
+double vp8_gaussian(double sigma, double mu, double x)
+{
+    return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+           (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+extern void (*vp8_clear_system_state)(void);
+
+
+// Build the additive-noise lookup table used by the VP8D_ADDNOISE path.
+// q selects the noise spread (larger 63-q => wider gaussian), a is the
+// caller-requested noise level. The generated table and clamps are
+// cached in *state along with last_q/last_noise so callers can skip
+// regeneration when the parameters are unchanged.
+static void fillrd(struct postproc_state *state, int q, int a)
+{
+    char char_dist[300];
+
+    double sigma;
+    int ai = a, qi = q, i;
+
+    vp8_clear_system_state();
+
+
+    sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+    // set up a lookup table of 256 entries that matches
+    // a gaussian distribution with sigma determined by q.
+    //
+    {
+        int next = 0, j;
+        double x;
+
+        for (x = -32; x < 32; x++)
+        {
+            // number of table slots carrying this noise amplitude
+            int count = (int)(.5 + 256 * vp8_gaussian(sigma, 0, x));
+
+            // The per-bin +.5 rounding can make the slot counts total
+            // more than 256 (up to one extra slot per nonzero bin), so
+            // clamp to keep every write inside char_dist[].
+            if (next + count > (int)sizeof(char_dist))
+                count = (int)sizeof(char_dist) - next;
+
+            for (j = 0; j < count; j++)
+            {
+                char_dist[next + j] = (char) x;
+            }
+
+            next += count;
+        }
+
+        // zero-fill so every index reachable by rand() & 0xff is valid
+        for (; next < 256; next++)
+            char_dist[next] = 0;
+
+    }
+
+    for (i = 0; i < 3072; i++)
+    {
+        state->noise[i] = char_dist[rand() & 0xff];
+    }
+
+    // char_dist[0] holds the most negative noise value; precompute the
+    // clamps that keep pixel + noise within [0, 255].
+    for (i = 0; i < 16; i++)
+    {
+        state->blackclamp[i] = -char_dist[0];
+        state->whiteclamp[i] = -char_dist[0];
+        state->bothclamp[i] = -2 * char_dist[0];
+    }
+
+    state->last_q = q;
+    state->last_noise = a;
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : plane_add_noise_c
+ *
+ * INPUTS : unsigned char *Start starting address of buffer to add gaussian
+ * noise to
+ * unsigned int Width width of plane
+ * unsigned int Height height of plane
+ * int Pitch distance between subsequent lines of frame
+ * int q quantizer used to determine amount of noise
+ * to add
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void.
+ *
+ * FUNCTION : adds gaussian noise to a plane of pixels
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int Width, unsigned int Height, int Pitch)
+{
+    unsigned int i, j;
+    // NOTE(review): bothclamp is unused in this C version -- presumably
+    // kept so the signature matches the SIMD implementations; confirm.
+
+    for (i = 0; i < Height; i++)
+    {
+        unsigned char *Pos = Start + i * Pitch;
+        // random start offset into the 3072-byte noise table per line
+        char *Ref = (char *)(noise + (rand() & 0xff));
+
+        for (j = 0; j < Width; j++)
+        {
+            // pre-clamp the pixel so adding the noise sample cannot
+            // wrap below 0 or above 255
+            if (Pos[j] < blackclamp[0])
+                Pos[j] = blackclamp[0];
+
+            if (Pos[j] > 255 + whiteclamp[0])
+                Pos[j] = 255 + whiteclamp[0];
+
+            Pos[j] += Ref[j];
+        }
+    }
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
+#else
+#define RTCD_VTABLE(oci) NULL
+#endif
+
+// Top-level post-processing entry point. Fills *dest with the frame to
+// display: the raw decoded frame when flags == 0, otherwise the
+// contents of oci->post_proc_buffer after optional deblocking /
+// de-macro-blocking, noise addition and debug-text overlays.
+// Returns 0 on success, -1 when there is no frame to show.
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+{
+    char message[512];
+    // derive an effective quantizer from the loop-filter level, capped at 63
+    int q = oci->filter_level * 10 / 6;
+
+    if (!oci->frame_to_show)
+        return -1;
+
+    if (q > 63)
+        q = 63;
+
+    if (!flags)
+    {
+        // no post-processing requested: hand back the decoded frame
+        // directly (struct copy of buffer pointers, no pixel copy)
+        *dest = *oci->frame_to_show;
+
+        // handle problem with extending borders
+        dest->y_width = oci->Width;
+        dest->y_height = oci->Height;
+        dest->uv_height = dest->y_height / 2;
+        return 0;
+
+    }
+
+#if ARCH_X86||ARCH_X86_64
+    vpx_reset_mmx_state();
+#endif
+
+    if (flags & VP8D_DEMACROBLOCK)
+    {
+        vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+                                       q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+    }
+    else if (flags & VP8D_DEBLOCK)
+    {
+        vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+                    q, 1, 0, RTCD_VTABLE(oci));
+    }
+    else
+    {
+        // no filtering: still copy into the post-proc buffer so the
+        // noise/overlay stages below have something to write on
+        vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
+    }
+
+    if (flags & VP8D_ADDNOISE)
+    {
+        // regenerate the noise table only when q or level changed
+        if (oci->postproc_state.last_q != q
+            || oci->postproc_state.last_noise != noise_level)
+        {
+            fillrd(&oci->postproc_state, 63 - q, noise_level);
+        }
+
+        POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)
+        (oci->post_proc_buffer.y_buffer,
+         oci->postproc_state.noise,
+         oci->postproc_state.blackclamp,
+         oci->postproc_state.whiteclamp,
+         oci->postproc_state.bothclamp,
+         oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height,
+         oci->post_proc_buffer.y_stride);
+    }
+
+    // debug overlays: blit text/per-macroblock glyphs onto the luma plane
+    if (flags & VP8D_DEBUG_LEVEL1)
+    {
+        sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+                (oci->frame_type == KEY_FRAME),
+                oci->refresh_golden_frame,
+                oci->base_qindex,
+                oci->filter_level,
+                flags,
+                oci->mb_cols, oci->mb_rows);
+        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+    }
+    else if (flags & VP8D_DEBUG_LEVEL2)
+    {
+        // one character per macroblock, encoding its prediction mode
+        int i, j;
+        unsigned char *y_ptr;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int mb_rows = post->y_height >> 4;
+        int mb_cols = post->y_width >> 4;
+        int mb_index = 0;
+        MODE_INFO *mi = oci->mi;
+
+        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+        // vp8_filter each macro block
+        for (i = 0; i < mb_rows; i++)
+        {
+            for (j = 0; j < mb_cols; j++)
+            {
+                char zz[4];
+
+                sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+                vp8_blit_text(zz, y_ptr, post->y_stride);
+                mb_index ++;
+                y_ptr += 16;
+            }
+
+            mb_index ++; //border
+            y_ptr += post->y_stride * 16 - post->y_width;
+
+        }
+    }
+    else if (flags & VP8D_DEBUG_LEVEL3)
+    {
+        // one character per macroblock, encoding its dc_diff flag
+        int i, j;
+        unsigned char *y_ptr;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int mb_rows = post->y_height >> 4;
+        int mb_cols = post->y_width >> 4;
+        int mb_index = 0;
+        MODE_INFO *mi = oci->mi;
+
+        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+        // vp8_filter each macro block
+        for (i = 0; i < mb_rows; i++)
+        {
+            for (j = 0; j < mb_cols; j++)
+            {
+                char zz[4];
+
+                if (oci->frame_type == KEY_FRAME)
+                    sprintf(zz, "a");
+                else
+                    sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
+
+                vp8_blit_text(zz, y_ptr, post->y_stride);
+                mb_index ++;
+                y_ptr += 16;
+            }
+
+            mb_index ++; //border
+            y_ptr += post->y_stride * 16 - post->y_width;
+
+        }
+    }
+    else if (flags & VP8D_DEBUG_LEVEL4)
+    {
+        sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
+        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+#if 0
+        int i, j;
+        unsigned char *y_ptr;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int mb_rows = post->y_height >> 4;
+        int mb_cols = post->y_width >> 4;
+        int mb_index = 0;
+        MODE_INFO *mi = oci->mi;
+
+        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+        // vp8_filter each macro block
+        for (i = 0; i < mb_rows; i++)
+        {
+            for (j = 0; j < mb_cols; j++)
+            {
+                char zz[4];
+
+                sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
+                vp8_blit_text(zz, y_ptr, post->y_stride);
+                mb_index ++;
+                y_ptr += 16;
+            }
+
+            mb_index ++; //border
+            y_ptr += post->y_stride * 16 - post->y_width;
+
+        }
+
+#endif
+
+    }
+
+
+
+    *dest = oci->post_proc_buffer;
+
+    // handle problem with extending borders
+    dest->y_width = oci->Width;
+    dest->y_height = oci->Height;
+    dest->uv_height = dest->y_height / 2;
+    return 0;
+}
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
new file mode 100644
index 000000000..c45fe9252
--- /dev/null
+++ b/vp8/common/postproc.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_H
+#define POSTPROC_H
+
+#define prototype_postproc_inplace(sym)\
+ void sym (unsigned char *dst, int pitch, int rows, int cols,int flimit)
+
+#define prototype_postproc(sym)\
+ void sym (unsigned char *src, unsigned char *dst, int src_pitch,\
+ int dst_pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc_addnoise(sym) \
+ void sym (unsigned char *s, char *noise, char blackclamp[16],\
+ char whiteclamp[16], char bothclamp[16],\
+ unsigned int w, unsigned int h, int pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/postproc_x86.h"
+#endif
+
+#ifndef vp8_postproc_down
+#define vp8_postproc_down vp8_mbpost_proc_down_c
+#endif
+extern prototype_postproc_inplace(vp8_postproc_down);
+
+#ifndef vp8_postproc_across
+#define vp8_postproc_across vp8_mbpost_proc_across_ip_c
+#endif
+extern prototype_postproc_inplace(vp8_postproc_across);
+
+#ifndef vp8_postproc_downacross
+#define vp8_postproc_downacross vp8_post_proc_down_and_across_c
+#endif
+extern prototype_postproc(vp8_postproc_downacross);
+
+#ifndef vp8_postproc_addnoise
+#define vp8_postproc_addnoise vp8_plane_add_noise_c
+#endif
+extern prototype_postproc_addnoise(vp8_postproc_addnoise);
+
+
+typedef prototype_postproc((*vp8_postproc_fn_t));
+typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
+typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
+typedef struct
+{
+ vp8_postproc_inplace_fn_t down;
+ vp8_postproc_inplace_fn_t across;
+ vp8_postproc_fn_t downacross;
+ vp8_postproc_addnoise_fn_t addnoise;
+} vp8_postproc_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define POSTPROC_INVOKE(ctx,fn) vp8_postproc_##fn
+#endif
+
+#include "vpx_ports/mem.h"
+struct postproc_state
+{
+ int last_q;
+ int last_noise;
+ char noise[3072];
+ DECLARE_ALIGNED(16, char, blackclamp[16]);
+ DECLARE_ALIGNED(16, char, whiteclamp[16]);
+ DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
+ int deblock_level, int noise_level, int flags);
+
+
+void vp8_de_noise(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag,
+ vp8_postproc_rtcd_vtable_t *rtcd);
+#endif
diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm
new file mode 100644
index 000000000..e87eb2112
--- /dev/null
+++ b/vp8/common/ppc/copy_altivec.asm
@@ -0,0 +1,46 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Copy a 16x16 pixel block.
+;# Make the assumption that input will not be aligned,
+;# but the output will be. So two reads and a perm
+;# for the input, but only one store for the output.
+copy_mem16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xe000
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r10, 16
+    mtctr   r10                 ;# loop 16 rows via the count register
+
+cp_16x16_loop:
+    lvsl    v0,  0, r3          ;# permutate value for alignment
+
+    lvx     v1,   0, r3         ;# two vectors straddle the unaligned row
+    lvx     v2, r10, r3
+
+    vperm   v1, v1, v2, v0      ;# merge into one aligned 16-byte row
+
+    stvx    v1,  0, r5
+
+    add     r3, r3, r4          ;# increment source pointer
+    add     r5, r5, r6          ;# increment destination pointer
+
+    bdnz    cp_16x16_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm
new file mode 100644
index 000000000..2a3550773
--- /dev/null
+++ b/vp8/common/ppc/filter_altivec.asm
@@ -0,0 +1,1012 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl sixtap_predict_ppc
+ .globl sixtap_predict8x4_ppc
+ .globl sixtap_predict8x8_ppc
+ .globl sixtap_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+ load_c \V0, HFilter, r5, r9, r10
+
+ addi r5, r5, 16
+ lvx \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+ load_c v0, VFilter, r6, r3, r10
+
+ vspltish v5, 8
+ vspltish v6, 3
+ vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v1, v0, 1
+ vspltb v2, v0, 2
+ vspltb v3, v0, 3
+ vspltb v4, v0, 4
+ vspltb v5, v0, 5
+ vspltb v0, v0, 0
+.endm
+
+.macro vpre_load
+ Vprolog
+ li r10, 16
+ lvx v10, 0, r9 ;# v10..v14 = first 5 rows
+ lvx v11, r10, r9
+ addi r9, r9, 32
+ lvx v12, 0, r9
+ lvx v13, r10, r9
+ addi r9, r9, 32
+ lvx v14, 0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+ ;# (Re,Ro) += (V*T)
+ vmuleub \TMP, \V, \T ;# trashes v8
+ vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
+ vmuloub \TMP, \V, \T
+ vadduhm \Ro, \Ro, \TMP ;# Ro = odds
+.endm
+
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+ vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
+ vadduhm v16, v6, v8
+ vmuloub v8, \P0, v0
+ vadduhm v17, v6, v8
+ Msum v16, v17, \P2, v2, v8
+ Msum v16, v17, \P3, v3, v8
+ Msum v16, v17, \P5, v5, v8
+
+ vmuleub v18, \P1, v1 ;# 2 negative taps
+ vmuloub v19, \P1, v1
+ Msum v18, v19, \P4, v4, v8
+
+ vsubuhs v16, v16, v18 ;# subtract neg from pos
+ vsubuhs v17, v17, v19
+ vsrh v16, v16, v7 ;# divide by 128
+ vsrh v17, v17, v7 ;# v16 v17 = evens, odds
+ vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
+ vmrglh v19, v16, v17
+ vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
+.endm
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+ vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
+ vadduhm v21, v20, v24
+ vmuloub v24, \P0, v13
+ vadduhm v22, v20, v24
+ Msum v21, v22, \P2, v15, v25
+ Msum v21, v22, \P3, v16, v25
+ Msum v21, v22, \P5, v18, v25
+
+ vmuleub v23, \P1, v14 ;# 2 negative taps
+ vmuloub v24, \P1, v14
+ Msum v23, v24, \P4, v17, v25
+
+ vsubuhs v21, v21, v23 ;# subtract neg from pos
+ vsubuhs v22, v22, v24
+ vsrh v21, v21, v19 ;# divide by 128
+ vsrh v22, v22, v19 ;# v16 v17 = evens, odds
+ vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order
+ vmrglh v24, v21, v22
+ vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+ vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+ stvx \P0, 0, r7
+ add r7, r7, r8 ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+ addi r9, r9, 16 ;# P5 = newest input row
+ lvx \P5, 0, r9
+ Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+ luma_v v10, v11, v12, v13, v14, v15
+ luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+ luma_vtwo
+ luma_v v12, v13, v14, v15, v10, v11
+ luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+ luma_vfour
+ luma_v v14, v15, v10, v11, v12, v13
+ luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+ vmsummbm \R, v13, \I, v15
+ vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input will can span three vectors if not aligned correctly.
+ lvx \VD, 0, \RS
+ lvx v20, r10, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+ vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
+ vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
+ Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
+ vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
+ Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
+
+ vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
+ vsrh \R, \R, v19
+
+ vpkuhus \R, \R, \R ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input will can span three vectors if not aligned correctly.
+ lvx v20, 0, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, v20, v20, v21
+.endm
+ .text
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xff87
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- vertical_only_4x4
+
+ ;# load up horizontal filter
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_4x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_4x4
+
+vertical_only_4x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_4x4:
+ load_c v20, b_hilo_4x4, 0, r9, r10
+ load_c v21, b_hilo, 0, r9, r10
+
+ ;# reposition input so that it can go through the
+ ;# filtering phase with one pass.
+ vperm v0, v0, v1, v20 ;# 0 1 x x
+ vperm v2, v2, v3, v20 ;# 2 3 x x
+ vperm v4, v4, v5, v20 ;# 4 5 x x
+ vperm v6, v6, v7, v20 ;# 6 7 x x
+
+ vperm v0, v0, v2, v21 ;# 0 1 2 3
+ vperm v4, v4, v6, v21 ;# 4 5 6 7
+
+ vsldoi v1, v0, v4, 4
+ vsldoi v2, v0, v4, 8
+ vsldoi v3, v0, v4, 12
+
+ vsldoi v5, v4, v8, 4
+
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+ stvx v0, 0, r1
+
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 4(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 8(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 12(r1)
+ stw r0, 0(r7)
+
+ b exit_4x4
+
+store_4x4:
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v4, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v5, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x4
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_8x4
+
+second_pass_pre_copy_8x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x4:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+ b exit_8x4
+
+store_8x4:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x4
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned2_8x4:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+
+ blr
+
+ .align 2
+;# sixtap_predict8x8_ppc: two-pass (horizontal, then vertical) 6-tap
+;# subpel interpolation of an 8x8 block. Either pass is skipped when its
+;# offset is zero.
+;#
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width that needs to be filtered will fit in a single altivec
+;# register there is no need to loop. Everything can stay in registers.
+sixtap_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE (mark the vector regs used here as live)
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array ('.' sets cr0)
+
+ vspltish v19, 7 ;# shift count: divide by 128 after filtering
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x8
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0 ;# remember block start; reused for the 2 rows above it
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array ('.' sets cr0)
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+ interp_8x8 v9
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x8
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v10
+ interp_8x8 v11
+ interp_8x8 v12
+
+ b second_pass_8x8
+
+second_pass_pre_copy_8x8:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x8:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ ;# broadcast each of the 6 vertical taps into its own register
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+ vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
+ vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
+ vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
+ vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8
+
+ ;# general dst_pitch: write 8 bytes per row via the stack
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8:
+
+ ;# dst_pitch == 8: rows are contiguous, pack two rows per vector store.
+ ;# NOTE(review): stvx ignores the low 4 address bits, so this path
+ ;# assumes dst is 16-byte aligned -- confirm callers.
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+ b exit_8x8
+
+store_8x8:
+ ;# horizontal-only path: the 8 result rows live in v2..v9
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x8
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+ w_8x8 v8, r7, r0, r8
+ w_8x8 v9, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned2_8x8:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+ vperm v8, v8, v9, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+ addi r7, r7, 16
+ stvx v8, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
+;# edges. One of the filters can be null, but both won't be. Needs to use a
+;# temporary buffer because the source buffer can't be modified and the buffer
+;# for the destination is not large enough to hold the temporary data.
+sixtap_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-416(r1) ;# create space on the stack (temp buffer: 21 rows x 16B)
+
+ ;# Three possibilities
+ ;# 1. First filter is null. Don't use a temp buffer.
+ ;# 2. Second filter is null. Don't use a temp buffer.
+ ;# 3. Neither are null, use temp buffer.
+
+ ;# First Pass (horizontal edge)
+ ;# setup pointers for src
+ ;# if possibility (1) then setup the src pointer to be the original and jump
+ ;# to second pass. this is based on if x_offset is 0.
+
+ ;# load up horizontal filter
+ slwi. r5, r5, 5 ;# index into horizontal filter array ('.' sets cr0)
+
+ load_hfilter v4, v5
+
+ beq- copy_horizontal_16x21
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ slwi. r6, r6, 4 ;# index into vertical filter array ('.' sets cr0)
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v14, b_hperm, 0, r9, r10
+
+ ;# These statements are guessing that there won't be a second pass,
+ ;# but if there is then inside the bypass they need to be set
+ li r0, 16 ;# prepare for no vertical filter
+
+ ;# Change the output pointer and pitch to be the actual
+ ;# destination instead of a temporary buffer.
+ addi r9, r7, 0
+ addi r5, r8, 0
+
+ ;# no vertical filter, so write the output from the first pass
+ ;# directly into the output buffer.
+ beq- no_vertical_filter_bypass
+
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# setup counter for the number of lines that are going to be filtered
+ li r0, 21
+
+ ;# use the stack as temporary storage
+ la r9, 48(r1)
+ li r5, 16
+
+no_vertical_filter_bypass:
+
+ mtctr r0
+
+ ;# rounding added in on the multiply
+ vspltisw v10, 8
+ vspltisw v12, 3
+ vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v13, 7
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+horizontal_loop_16x16:
+
+ lvsl v15, 0, r3 ;# permute value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+ lvx v3, r12, r3
+
+ vperm v8, v1, v2, v15
+ vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
+
+ vsldoi v11, v8, v9, 4
+
+ ;# set 0
+ vmsummbm v6, v4, v8, v12 ;# taps times elements
+ vmsummbm v0, v5, v11, v6
+
+ ;# set 1
+ vsldoi v10, v8, v9, 1
+ vsldoi v11, v8, v9, 5
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v1, v5, v11, v6
+
+ ;# set 2
+ vsldoi v10, v8, v9, 2
+ vsldoi v11, v8, v9, 6
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v2, v5, v11, v6
+
+ ;# set 3
+ vsldoi v10, v8, v9, 3
+ vsldoi v11, v8, v9, 7
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v3, v5, v11, v6
+
+ vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
+
+ vsrh v0, v0, v13 ;# divide v0, v1 by 128
+ vsrh v1, v1, v13
+
+ vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
+ vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
+
+ stvx v0, 0, r9
+ add r9, r9, r5
+
+ add r3, r3, r4
+
+ bdnz horizontal_loop_16x16
+
+ ;# check again to see if vertical filter needs to be done.
+ cmpi cr0, r6, 0
+ beq cr0, end_16x16
+
+ ;# yes there is, so go to the second pass
+ b second_pass_16x16
+
+copy_horizontal_16x21:
+ li r10, 21
+ mtctr r10
+
+ li r10, 16
+
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# this is done above if there is a horizontal filter,
+ ;# if not it needs to be done down here.
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+ ;# always write to the stack when doing a horizontal copy
+ la r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+ lvsl v15, 0, r3 ;# permute value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v8, v1, v2, v15
+
+ stvx v8, 0, r9
+ addi r9, r9, 16
+
+ add r3, r3, r4
+
+ bdnz copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+ ;# always read from the stack when doing a vertical filter
+ la r9, 48(r1)
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v7, 7
+
+ vpre_load
+
+ luma_vsix
+ luma_vsix
+ luma_vfour
+
+end_16x16:
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ ;# Horizontal 6-tap filters: one 32-byte entry per subpel position.
+ ;# First 16-byte row = taps 0-3 (replicated x4 for vmsummbm), second
+ ;# row = taps 4-5 (padded with zeros).
+ .align 4
+HFilter:
+ .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
+ .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
+ .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
+ .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
+ .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+ .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
+ .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
+ .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
+ .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
+ .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
+ .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
+ .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+
+ ;# Vertical 6-tap filters: 16 bytes (6 taps + pad) per subpel position.
+ ;# NOTE(review): unlike HFilter these taps are all stored as magnitudes;
+ ;# the sign handling appears to live in the vertical filter macros
+ ;# defined earlier in this file -- confirm.
+ .align 4
+VFilter:
+ .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+ ;# Permute vector that unscrambles the packed first-pass output
+ ;# (0 4 8 C 1 5 9 D 2 6 A E 3 7 B F -> 0..F).
+ .align 4
+b_hperm:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ ;# Byte-gather patterns forming the four phase-shifted input sets.
+ .align 4
+B_0123:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+B_4567:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+ .align 4
+B_89AB:
+ .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+ ;# Packs the first 8 bytes of two vectors into one (pairs two 8-wide rows).
+ .align 4
+b_hilo:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+b_hilo_4x4:
+ .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm
new file mode 100644
index 000000000..27e02a87f
--- /dev/null
+++ b/vp8/common/ppc/filter_bilinear_altivec.asm
@@ -0,0 +1,676 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl bilinear_predict4x4_ppc
+ .globl bilinear_predict8x4_ppc
+ .globl bilinear_predict8x8_ppc
+ .globl bilinear_predict16x16_ppc
+
+;# load_c: load vector \V from LABEL + \OFF; clobbers GPRs \R0 and \R1.
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# load_vfilter: load the two bilinear vertical tap vectors selected by
+;# the pre-scaled y_offset index in r6; advances r6 past the first vector.
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r9, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+;# HProlog: shared first-pass setup. Scales x_offset (r5) into a filter
+;# index and branches to \jump_label when it is zero (no horizontal
+;# filtering); otherwise loads the taps and constants, then scales
+;# y_offset (r6), leaving cr0 set for the caller's vertical-filter test.
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r9, r0
+
+ ;# setup constants
+ ;# v28 permutation value for reordering the packed output
+ load_c v28, b_hperm_b, 0, r9, r0
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro HFilter V
+ vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
+ vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 8 filtered pixels (16-bit, in order)
+
+ vsrh v24, v24, v19 ;# divide v24 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = 8-bit result (low 8 bytes, duplicated)
+.endm
+
+;# hfilter_8: horizontally filter one 8-wide row at r3 into \V;
+;# advances r3 by the pitch (r4) when increment_counter is nonzero.
+.macro hfilter_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permute value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ HFilter \V
+.endm
+
+
+;# load_and_align_8: load one (possibly unaligned) row into \V,
+;# left-justified, without filtering. Advances r3 when requested.
+.macro load_and_align_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permute value for alignment
+
+ ;# merge the two vectors the unaligned row may span; only the first
+ ;# 8+1 bytes are consumed by the vertical filter.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+;# write_aligned_8: store \V at dst (r7) and optionally advance by the
+;# dst pitch. NOTE(review): stvx ignores the low 4 address bits, so r7
+;# is assumed 16-byte aligned here -- confirm callers.
+.macro write_aligned_8 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+;# vfilter_16: bilinear vertical filter over two rows:
+;# \P0 = (\P0*v20 + \P1*v21 + 64) >> 7, done in 16 bits via separate
+;# even/odd byte multiplies. v18 = rounding constant, v19 = shift of 7.
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v22 v23 = evens, odds
+ vmrghh \P0, v22, v23 ;# re-interleave evens/odds: 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+
+;# w_8x8: write the first 8 bytes of vector \V to dst (r7) via a stack
+;# spill and two word stores (works for any dst alignment, unlike stvx),
+;# then advance \D by pitch \P. \R is a scratch GPR.
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+
+ .align 2
+;# bilinear_predict4x4_ppc: two-pass bilinear subpel interpolation of a
+;# 4x4 block (rows are computed 8 wide; only 4 bytes per row are stored).
+;#
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_4x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_4x4_b:
+
+ ;# write 4 bytes per row via the stack (handles any dst alignment)
+ stvx v0, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v1, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# bilinear_predict8x4_ppc: two-pass bilinear subpel interpolation of an
+;# 8x4 block.
+;#
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_8x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_8x4_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4_b
+
+ ;# general dst_pitch: write 8 bytes per row via the stack
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4_b:
+ ;# dst_pitch == 8: rows are contiguous, pack two rows per vector store
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# bilinear_predict8x8_ppc: two-pass bilinear subpel interpolation of an
+;# 8x8 block.
+;#
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+ hfilter_8 v4, 1
+ hfilter_8 v5, 1
+ hfilter_8 v6, 1
+ hfilter_8 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x8_b
+
+ hfilter_8 v8, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+ load_and_align_8 v5, 1
+ load_and_align_8 v6, 1
+ load_and_align_8 v7, 1
+ load_and_align_8 v8, 0
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+store_out_8x8_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8_b
+
+ ;# general dst_pitch: write 8 bytes per row via the stack
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8_b:
+ ;# dst_pitch == 8: rows are contiguous, pack two rows per vector store
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permute value for alignment
+
+ ;# input to filter is 17 bytes wide (2 taps), output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v21 v22 = input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v24, v25 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+;# load_and_align_16: load one (possibly unaligned) 16-byte row into \V;
+;# a misaligned row spans two vectors, which are merged with vperm.
+.macro load_and_align_16 V, increment_counter
+ lvsl v17, 0, r3 ;# permute value for alignment
+
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+;# write_16: store one 16-byte row at dst (r7), optionally advancing by
+;# the dst pitch. NOTE(review): stvx ignores the low 4 address bits, so
+;# dst is assumed 16-byte aligned -- confirm callers.
+.macro write_16 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+ .align 2
+;# bilinear_predict16x16_ppc: two-pass bilinear subpel interpolation of
+;# a 16x16 block. No stack frame is created: output rows go straight to
+;# dst with stvx (see write_16), so no scratch spill space is needed.
+;#
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, 1
+ load_and_align_16 v1, 1
+ load_and_align_16 v2, 1
+ load_and_align_16 v3, 1
+ load_and_align_16 v4, 1
+ load_and_align_16 v5, 1
+ load_and_align_16 v6, 1
+ load_and_align_16 v7, 1
+ load_and_align_16 v8, 1
+ load_and_align_16 v9, 1
+ load_and_align_16 v10, 1
+ load_and_align_16 v11, 1
+ load_and_align_16 v12, 1
+ load_and_align_16 v13, 1
+ load_and_align_16 v14, 1
+ load_and_align_16 v15, 1
+ load_and_align_16 v16, 0
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+ write_16 v0, 1
+ write_16 v1, 1
+ write_16 v2, 1
+ write_16 v3, 1
+ write_16 v4, 1
+ write_16 v5, 1
+ write_16 v6, 1
+ write_16 v7, 1
+ write_16 v8, 1
+ write_16 v9, 1
+ write_16 v10, 1
+ write_16 v11, 1
+ write_16 v12, 1
+ write_16 v13, 1
+ write_16 v14, 1
+ write_16 v15, 0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ ;# Bilinear horizontal filters: 8 subpel positions, one 16-byte row
+ ;# each. Each word holds the two taps plus two zero pads (consumed by
+ ;# vmsummbm, which multiplies all four bytes of the word).
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ ;# Bilinear vertical filters: two splatted 16-byte tap vectors per
+ ;# subpel position (loaded as a pair by load_vfilter).
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+ ;# Fix: b_hilo_b is loaded with lvx, which requires 16-byte alignment.
+ ;# It previously had no .align directive (unlike every other constant
+ ;# here) and was aligned only by accident of the preceding 16-byte rows.
+ .align 4
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm
new file mode 100644
index 000000000..e88af8d7d
--- /dev/null
+++ b/vp8/common/ppc/idctllm_altivec.asm
@@ -0,0 +1,188 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl short_idct4x4llm_ppc
+
+;# load_c: load vector \V from LABEL + \OFF; clobbers GPRs \R0 and \R1.
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# short_idct4x4llm_ppc: 4x4 inverse DCT (VP8 "LLM" variant). Two
+;# butterfly passes with a transpose between them, (x + 4) >> 3 rounding,
+;# and the 16-bit result is stored as 4 rows of 8 bytes at 'pitch' apart.
+;#
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+ .align 2
+short_idct4x4llm_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ load_c v8, sinpi8sqrt2, 0, r9, r10
+ load_c v9, cospi8sqrt2minus1, 0, r9, r10
+ load_c v10, hi_hi, 0, r9, r10
+ load_c v11, lo_lo, 0, r9, r10
+ load_c v12, shift_16, 0, r9, r10
+
+ li r10, 16
+ lvx v0, 0, r3 ;# input ip[0], ip[ 4]
+ lvx v1, r10, r3 ;# input ip[8], ip[12]
+
+ ;# first pass
+ vupkhsh v2, v0
+ vupkhsh v3, v1
+ vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
+
+ ;# 35468 is read by vmulosh as signed (-30068); the vaddsws of the
+ ;# input afterwards compensates: x*35468>>16 == x + (x*-30068>>16).
+ vupklsh v0, v0
+ vmulosh v4, v0, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ ;# 20091 == cos(pi/8)*sqrt(2)*65536 - 65536; adding the input restores
+ ;# the dropped integer part.
+ vupklsh v1, v1
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v3, v1, v8
+ vsraw v3, v3, v12
+ vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v0, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v0
+
+ vaddsws v3, v3, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# transpose input
+ vmrghw v4, v0, v1 ;# a0 b0 a1 b1
+ vmrghw v5, v2, v3 ;# c0 d0 c1 d1
+
+ vmrglw v6, v0, v1 ;# a2 b2 a3 b3
+ vmrglw v7, v2, v3 ;# c2 d2 c3 d3
+
+ vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
+ vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
+
+ vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
+ vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
+
+ ;# second pass
+ vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
+
+ vmulosh v4, v1, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v3, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v3
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v2, v3, v8
+ vsraw v2, v2, v12
+ vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vaddsws v3, v2, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# rounding: result = (x + 4) >> 3
+ vspltish v6, 4
+ vspltish v7, 3
+
+ vpkswss v0, v0, v1
+ vpkswss v1, v2, v3
+
+ vaddshs v0, v0, v6
+ vaddshs v1, v1, v6
+
+ vsrah v0, v0, v7
+ vsrah v1, v1, v7
+
+ ;# transpose output
+ vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
+ vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+ vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
+ vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ ;# spill each result vector and copy it out one 8-byte row at a time
+ ;# (two word stores per row, so dst alignment doesn't matter)
+ stvx v0, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ stvx v1, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ ;# sin(pi/8)*sqrt(2) in Q16 (35468); used via the multiply-then-add
+ ;# trick documented in short_idct4x4llm_ppc above.
+ .align 4
+sinpi8sqrt2:
+ .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+ ;# cos(pi/8)*sqrt(2) - 1 in Q16 (20091).
+ .align 4
+cospi8sqrt2minus1:
+ .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+ ;# per-element shift counts for vsraw (arithmetic shift by 16)
+ .align 4
+shift_16:
+ .long 16, 16, 16, 16
+
+ ;# permute vectors gathering the high/low halves of two source vectors
+ .align 4
+hi_hi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+lo_lo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c
new file mode 100644
index 000000000..586eed477
--- /dev/null
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+ unsigned char *u, // source pointer
+ unsigned char *v, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+
+ if (u_ptr)
+ mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+
+ if (u_ptr)
+ mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ // These should all be done at once with one call, instead of 3
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void)simpler_lpf;
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm
new file mode 100644
index 000000000..78a5cf9b3
--- /dev/null
+++ b/vp8/common/ppc/loopfilter_filters_altivec.asm
@@ -0,0 +1,1252 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl mbloop_filter_horizontal_edge_y_ppc
+ .globl loop_filter_horizontal_edge_y_ppc
+ .globl mbloop_filter_vertical_edge_y_ppc
+ .globl loop_filter_vertical_edge_y_ppc
+
+ .globl mbloop_filter_horizontal_edge_uv_ppc
+ .globl loop_filter_horizontal_edge_uv_ppc
+ .globl mbloop_filter_vertical_edge_uv_ppc
+ .globl loop_filter_vertical_edge_uv_ppc
+
+ .globl loop_filter_simple_horizontal_edge_ppc
+ .globl loop_filter_simple_vertical_edge_ppc
+
+ .text
+;# We often need to perform transposes (and other transpose-like operations)
+;# on matrices of data. This is simplified by the fact that we usually
+;# operate on hunks of data whose dimensions are powers of 2, or at least
+;# divisible by highish powers of 2.
+;#
+;# These operations can be very confusing. They become more straightforward
+;# when we think of them as permutations of address bits: Concatenate a
+;# group of vector registers and think of it as occupying a block of
+;# memory beginning at address zero. The low four bits 0...3 of the
+;# address then correspond to position within a register, the higher-order
+;# address bits select the register.
+;#
+;# Although register selection, at the code level, is arbitrary, things
+;# are simpler if we use contiguous ranges of register numbers, simpler
+;# still if the low-order bits of the register number correspond to
+;# conceptual address bits. We do this whenever reasonable.
+;#
+;# A 16x16 transpose can then be thought of as an operation on
+;# a 256-element block of memory. It takes 8 bits 0...7 to address this
+;# memory and the effect of a transpose is to interchange address bit
+;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
+;# column, which is interchanged with the row addressed by bits 4..7.
+;#
+;# The altivec merge instructions provide a rapid means of effecting
+;# many of these transforms. They operate at three widths (8,16,32).
+;# Writing V(x) for vector register #x, paired merges permute address
+;# indices as follows.
+;#
+;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
+;#
+;# vmrghb V( x), V( y), V( y + (1<<s))
+;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
+;#
+;# vmrghh V( x), V( y), V( y + (1<<s))
+;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= =1= 2->3 3->(4+d) (4+s)->2:
+;#
+;# vmrghw V( x), V( y), V( y + (1<<s))
+;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# Unfortunately, there is no doubleword merge instruction.
+;# The following sequence uses "vperm" as a substitute.
+;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
+;# are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
+;#
+;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
+;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;# Except for bits s and d, the other relationships between register
+;# number (= high-order part of address) bits are at the disposal of
+;# the programmer.
+;#
+
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;# edges together. This requires a single 16x16 transpose, which, in
+;# the above language, amounts to the following permutation of address
+;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
+;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;# Except for the fact that the destination registers get written
+;# before we are done referencing the old contents, the cyclic transform
+;# is effected by
+;#
+;# x = 0; do {
+;# vmrghb V(2x), V(x), V(x+8);
+;# vmrglb V(2x+1), V(x), V(x+8);
+;# } while( ++x < 8);
+;#
+;# For clarity, and because we can afford it, we do this transpose
+;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
+;# leaving the final result in 16 .. 31, as the lower registers are
+;# used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+ vmrghb \A, \X, \Y
+ vmrglb \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+ Tpair v16,v17, v0,v8
+ Tpair v18,v19, v1,v9
+ Tpair v20,v21, v2,v10
+ Tpair v22,v23, v3,v11
+ Tpair v24,v25, v4,v12
+ Tpair v26,v27, v5,v13
+ Tpair v28,v29, v6,v14
+ Tpair v30,v31, v7,v15
+.endm
+
+.macro t16_odd
+ Tpair v0,v1, v16,v24
+ Tpair v2,v3, v17,v25
+ Tpair v4,v5, v18,v26
+ Tpair v6,v7, v19,v27
+ Tpair v8,v9, v20,v28
+ Tpair v10,v11, v21,v29
+ Tpair v12,v13, v22,v30
+ Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+ t16_odd
+ t16_even
+ t16_odd
+ t16_even
+.endm
+
+;# Vertical edge filtering requires transposes. For the simple filter,
+;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;# v0 = 0 1 ... 14 15
+;# v1 = 16 17 ... 30 31
+;# v2 = 32 33 ... 46 47
+;# v3 = 48 49 ... 62 63
+;#
+;# In frame-buffer memory, the layout is:
+;#
+;# 0 16 32 48
+;# 1 17 33 49
+;# ...
+;# 15 31 47 63.
+;#
+;# We begin by reading the data 32 bits at a time (using scalar operations)
+;# into a temporary array, reading the rows of the array into vector registers,
+;# with the following layout:
+;#
+;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
+;# v1 = 1 17 33 49 5 21 ... 45 61
+;# v2 = 2 18 ... 46 62
+;# v3 = 3 19 ... 47 63
+;#
+;# From the "address-bit" perspective discussed above, we simply need to
+;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;# In other words, we transpose each of the four 4x4 submatrices.
+;#
+;# This transformation is its own inverse, and we need to perform it
+;# again before writing the pixels back into the frame buffer.
+;#
+;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
+;# defined above. We think of both groups of 4 registers as having
+;# "addresses" {0,1,2,3} * 16.
+;#
+.macro Transpose4times4x4 Vlo, Vhi
+
+ ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
+
+ vmrghb v4, v0, v1
+ vmrglb v5, v0, v1
+ vmrghb v6, v2, v3
+ vmrglb v7, v2, v3
+
+ ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
+
+ vmrghh v0, v4, v6
+ vmrglh v1, v4, v6
+ vmrghh v2, v5, v7
+ vmrglh v3, v5, v7
+
+ ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
+
+ vmrghw v4, v0, v1
+ vmrglw v5, v0, v1
+ vmrghw v6, v2, v3
+ vmrglw v7, v2, v3
+
+ ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
+
+ vperm v0, v4, v6, \Vlo
+ vperm v1, v4, v6, \Vhi
+ vperm v2, v5, v7, \Vlo
+ vperm v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;# We read 8 columns of data, initially in the following pattern:
+;#
+;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
+;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
+;# ...
+;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;# and wish to convert to:
+;#
+;# (0,0) ... (0,15)
+;# (1,0) ... (1,15)
+;# ...
+;# (7,0) ... (7,15).
+;#
+;# In "address bit" language, we wish to map
+;#
+;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
+;#
+;# This can be accomplished by 4 iterations of the cyclic transform
+;#
+;# I -> (I+1) mod 7;
+;#
+;# each iteration can be realized by (d=0, s=2):
+;#
+;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
+;#
+;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
+;# preserving v8 = sign converter.
+;#
+;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;# result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+ Tpair v10, v11, v0, v4
+ Tpair v12, v13, v1, v5
+ Tpair v14, v15, v2, v6
+ Tpair v16, v17, v3, v7
+.endm
+
+.macro t8x16_even
+ Tpair v0, v1, v10, v14
+ Tpair v2, v3, v11, v15
+ Tpair v4, v5, v12, v16
+ Tpair v6, v7, v13, v17
+.endm
+
+.macro transpose8x16_fwd
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+ t8x16_even
+.endm
+
+.macro transpose8x16_inv
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+.endm
+
+.macro Transpose16x16
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;# into vector register Vreg. Trashes r0
+.macro load_g Vreg, Gptr
+ lwz r0, \Gptr
+ lvx \Vreg, 0, r0
+.endm
+
+;# exploit the saturation here. if the answer is negative
+;# it will be clamped to 0. orring 0 with a positive
+;# number will be the positive number (abs)
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+ vsububs \RES, \A, \B
+ vsububs \TMP, \B, \A
+ vor \RES, \RES, \TMP
+.endm
+
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+ vsububs \TMP, \A, \B
+ vmaxub \RES, \RES, \TMP
+ vsububs \TMP, \B, \A
+ vmaxub \RES, \RES, \TMP
+.endm
+
+.macro Masks
+ ;# build masks
+ ;# input is all 8 bit unsigned (0-255). need to
+ ;# do abs(vala-valb) > limit. but no need to compare each
+ ;# value to the limit. find the max of the absolute differences
+ ;# and compare that to the limit.
+ ;# First hev
+ Abs v14, v13, v2, v3 ;# |P1 - P0|
+ max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
+
+ vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
+
+ ;# Next limit
+ max_abs v14, v13, v0, v1 ;# |P3 - P2|
+ max_abs v14, v13, v1, v2 ;# |P2 - P1|
+ max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
+ max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
+
+ vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
+
+ ;# flimit
+ Abs v14, v13, v3, v4 ;# |P0 - Q0|
+
+ vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
+
+ vor v8, v8, v9 ;# R = true if flimit or limit exceeded
+ ;# done building masks
+.endm
+
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+ ;# build constants
+ lvx \FL, 0, \RFL ;# flimit
+ lvx \LI, 0, \RLI ;# limit
+ lvx \TH, 0, \RTH ;# thresh
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+ ;# setup strides/pointers to be able to access
+ ;# all of the data
+ add r5, r4, r4 ;# r5 = 2 * stride
+ sub r6, r3, r5 ;# r6 -> 2 rows back
+ neg r7, r4 ;# r7 = -stride
+
+ ;# load 16 pixels worth of data to work on
+ sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
+ lvx v0, 0, r0 ;# P3 (read only)
+ lvx v1, r7, r6 ;# P2
+ lvx v2, 0, r6 ;# P1
+ lvx v3, r7, r3 ;# P0
+ lvx v4, 0, r3 ;# Q0
+ lvx v5, r4, r3 ;# Q1
+ lvx v6, r5, r3 ;# Q2
+ add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
+ lvx v7, r4, r0 ;# Q3 (read only)
+.endm
+
+;# Expects
+;# v10 == HEV
+;# v13 == tmp
+;# v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+ vxor \P1, \P1, v11 ;# SP1
+ vxor \P0, \P0, v11 ;# SP0
+ vxor \Q0, \Q0, v11 ;# SQ0
+ vxor \Q1, \Q1, v11 ;# SQ1
+
+ vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
+.if \HEV_PRESENT
+ vand v13, v13, v10 ;# f &= hev
+.endif
+ vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+ vandc v13, v13, v8 ;# f &= mask
+
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v13, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v13, v8 ;# f2 = c (f+3)
+
+ vsrab v13, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
+ vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
+.endm
+
+.macro vp8_mbfilter
+ Masks
+
+    ;# start the filtering here
+ vxor v1, v1, v11 ;# SP2
+ vxor v2, v2, v11 ;# SP1
+ vxor v3, v3, v11 ;# SP0
+ vxor v4, v4, v11 ;# SQ0
+ vxor v5, v5, v11 ;# SQ1
+ vxor v6, v6, v11 ;# SQ2
+
+ ;# add outer taps if we have high edge variance
+ vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
+
+ vsubsbs v14, v4, v3 ;# SQ0-SP0
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+ vandc v13, v13, v8 ;# f &= mask
+ vand v15, v13, v10 ;# f2 = f & hev
+
+ ;# save bottom 3 bits so that we round one side +4 and the other +3
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v15, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v15, v8 ;# f2 = c (f+3)
+
+ vsrab v14, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
+ vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
+
+ ;# only apply wider filter if not high edge variance
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vspltisb v9, 2
+ vnor v8, v8, v8
+ vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+ vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
+ vspltisb v8, 9
+
+ ;# roughly 1/7th difference across boundary
+ vspltish v10, 7
+ vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v8, v13
+ vaddshs v14, v14, v9 ;# += 63
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v6, v6, v10 ;# subtract from Q and add to P
+ vaddsbs v1, v1, v10
+
+ vxor v6, v6, v11
+ vxor v1, v1, v11
+
+ ;# roughly 2/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v8, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v5, v5, v10 ;# subtract from Q and add to P
+ vaddsbs v2, v2, v10
+
+ vxor v5, v5, v11
+ vxor v2, v2, v11
+
+ ;# roughly 3/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v12, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v4, v4, v10 ;# subtract from Q and add to P
+ vaddsbs v3, v3, v10
+
+ vxor v4, v4, v11
+ vxor v3, v3, v11
+.endm
+
+.macro SBFilter
+ Masks
+
+ common_adjust v3, v4, v2, v5, 1
+
+ ;# outer tap adjustments
+ vspltisb v8, 1
+
+ vaddubm v13, v13, v8 ;# f += 1
+ vsrab v13, v13, v8 ;# f >>= 1
+
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
+ vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
+
+ vxor v2, v2, v11
+ vxor v3, v3, v11
+ vxor v4, v4, v11
+ vxor v5, v5, v11
+.endm
+
+ .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ vp8_mbfilter
+
+ stvx v1, r7, r6 ;# P2
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+ stvx v6, r5, r3 ;# Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ SBFilter
+
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
+;# So we can read in an entire mb aligned. However if we want to filter the mb
+;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
+;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit
+;# of a waste. So this is an even uglier way to get around that.
+;# Using the regular register file words are read in and then saved back out to
+;# memory to align and order them up. Then they are read in using the
+;# vector register file.
+.macro RLVmb V, R
+ lwzux r0, r3, r4
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r3, r4
+ stw r0,12(\R)
+ lwz r0,-4(r3)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r3, r4
+ lwz r0, 8(\R)
+ stw r0,-4(r3)
+ lwz r0, 4(\R)
+ stwux r0, r3, r4
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+ sub r3, r3, r4
+
+ RLVmb v0, r9
+ RLVmb v1, r9
+ RLVmb v2, r9
+ RLVmb v3, r9
+ RLVmb v4, r9
+ RLVmb v5, r9
+ RLVmb v6, r9
+ RLVmb v7, r9
+
+ transpose8x16_fwd
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ vp8_mbfilter
+
+ transpose8x16_inv
+
+ add r3, r3, r4
+ neg r4, r4
+
+ WLVmb v17, r9
+ WLVmb v16, r9
+ WLVmb v15, r9
+ WLVmb v14, r9
+ WLVmb v13, r9
+ WLVmb v12, r9
+ WLVmb v11, r9
+ WLVmb v10, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RL V, R, P
+ lvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro WL V, R, P
+ stvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+ ;# K = |P0-P1| already
+ Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
+ vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
+ vcmpgtub v10, v14, v0
+
+ Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1]
+
+ max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
+ max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
+ max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
+
+ vmaxub v14, v14, v4 ;# M = max interior abs diff
+ vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
+
+ Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
+ vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
+ vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
+
+ ;# replace P1,Q1 w/signed versions
+ common_adjust \P0, \Q0, \P1, \Q1, 1
+
+ vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
+ vsrab v13, v13, v1
+ vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
+ vsubsbs \Q1, \Q1, v13
+ vaddsbs \P1, \P1, v13
+
+ vxor \P1, \P1, v11 ;# P1
+ vxor \P0, \P0, v11 ;# P0
+ vxor \Q0, \Q0, v11 ;# Q0
+ vxor \Q1, \Q1, v11 ;# Q1
+.endm
+
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ addi r9, r3, 0
+ RL v16, r9, r4
+ RL v17, r9, r4
+ RL v18, r9, r4
+ RL v19, r9, r4
+ RL v20, r9, r4
+ RL v21, r9, r4
+ RL v22, r9, r4
+ RL v23, r9, r4
+ RL v24, r9, r4
+ RL v25, r9, r4
+ RL v26, r9, r4
+ RL v27, r9, r4
+ RL v28, r9, r4
+ RL v29, r9, r4
+ RL v30, r9, r4
+ lvx v31, 0, r9
+
+ Transpose16x16
+
+ vspltisb v1, 1
+
+ build_constants r5, r6, r7, v3, v2, v0
+
+ Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1|
+
+ Fil v16, v17, v18, v19, v20, v21, v22, v23
+ Fil v20, v21, v22, v23, v24, v25, v26, v27
+ Fil v24, v25, v26, v27, v28, v29, v30, v31
+
+ Transpose16x16
+
+ addi r9, r3, 0
+ WL v16, r9, r4
+ WL v17, r9, r4
+ WL v18, r9, r4
+ WL v19, r9, r4
+ WL v20, r9, r4
+ WL v21, r9, r4
+ WL v22, r9, r4
+ WL v23, r9, r4
+ WL v24, r9, r4
+ WL v25, r9, r4
+ WL v26, r9, r4
+ WL v27, r9, r4
+ WL v28, r9, r4
+ WL v29, r9, r4
+ WL v30, r9, r4
+ stvx v31, 0, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+ andi. r7, r3, 8 ;# row origin modulo 16
+ add r7, r7, r7 ;# selects selectors
+ lis r12, _chromaSelectors@ha
+ la r0, _chromaSelectors@l(r12)
+ lwzux r0, r7, r0 ;# leave selector addr in r7
+
+ lvx \V, 0, r0 ;# mask to concatenate active U,V pels
+.endm
+
+.macro hread_uv Dest, U, V, Offs, VMask
+ lvx \U, \Offs, r3
+ lvx \V, \Offs, r4
+ vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+ vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
+ vperm \V, \New, \V, \Vmask
+ stvx \U, \Offs, r3 ;# Write to frame buffer
+ stvx \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+ neg r9, r5 ;# r9 = -1 * stride
+ add r8, r9, r9 ;# r8 = -2 * stride
+ add r10, r5, r5 ;# r10 = 2 * stride
+
+ active_chroma_sel v12
+
+ ;# P3, Q3 are read-only; need not save addresses or sibling pels
+ add r6, r8, r8 ;# r6 = -4 * stride
+ hread_uv v0, v14, v15, r6, v12
+ add r6, r10, r5 ;# r6 = 3 * stride
+ hread_uv v7, v14, v15, r6, v12
+
+ ;# Others are read/write; save addresses and sibling pels
+
+ add r6, r8, r9 ;# r6 = -3 * stride
+ hread_uv v1, v16, v17, r6, v12
+ hread_uv v2, v18, v19, r8, v12
+ hread_uv v3, v20, v21, r9, v12
+ hread_uv v4, v22, v23, 0, v12
+ hread_uv v5, v24, v25, r5, v12
+ hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+ load_g \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+ load_g \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+ uresult_sel v11
+ vresult_sel v12
+ hwrite_uv v2, v18, v19, r8, v11, v12
+ hwrite_uv v3, v20, v21, r9, v11, v12
+ hwrite_uv v4, v22, v23, 0, v11, v12
+ hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ vp8_mbfilter
+
+ store_chroma_h
+
+ hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
+ hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ SBFilter
+
+ store_chroma_h
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro R V, R
+ lwzux r0, r3, r5
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r4, r5
+ stw r0,12(\R)
+ lwz r0,-4(r4)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+
+.macro W V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r4, r5
+ lwz r0, 8(\R)
+ stw r0,-4(r4)
+ lwz r0, 4(\R)
+ stwux r0, r3, r5
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+ sub r3, r3, r5 ;# back up one line for simplicity
+ sub r4, r4, r5
+
+ R v0, \R
+ R v1, \R
+ R v2, \R
+ R v3, \R
+ R v4, \R
+ R v5, \R
+ R v6, \R
+ R v7, \R
+
+ transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+ transpose8x16_inv
+
+ add r3, r3, r5
+ add r4, r4, r5
+ neg r5, r5 ;# Write rows back in reverse order
+
+ W v17, \R
+ W v16, \R
+ W v15, \R
+ W v14, \R
+ W v13, \R
+ W v12, \R
+ W v11, \R
+ W v10, \R
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ vp8_mbfilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ SBFilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+ Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
+ vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit
+
+ ;# preserve unsigned v0 and v3
+ common_adjust v1, v2, v0, v3, 0
+
+ vxor v1, v1, v11
+ vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
+.endm
+
+.macro simple_vertical
+ addi r8, 0, 16
+ addi r7, r5, 32
+
+ lvx v0, 0, r5
+ lvx v1, r8, r5
+ lvx v2, 0, r7
+ lvx v3, r8, r7
+
+ lis r12, _B_hihi@ha
+ la r0, _B_hihi@l(r12)
+ lvx v16, 0, r0
+
+ lis r12, _B_lolo@ha
+ la r0, _B_lolo@l(r12)
+ lvx v17, 0, r0
+
+ Transpose4times4x4 v16, v17
+ vp8_simple_filter
+
+ vxor v0, v0, v11
+ vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
+
+ Transpose4times4x4 v16, v17
+
+ stvx v0, 0, r5
+ stvx v1, r8, r5
+ stvx v2, 0, r7
+ stvx v3, r8, r7
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ neg r5, r4 ;# r5 = -1 * stride
+ add r6, r5, r5 ;# r6 = -2 * stride
+
+ lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
+ lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
+ lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
+ lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
+
+ vp8_simple_filter
+
+ stvx v1, r5, r3 ;# store P0
+ stvx v2, 0, r3 ;# store Q0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RLV Offs
+ stw r0, (\Offs*4)(r5)
+ lwzux r0, r7, r4
+.endm
+
+.macro WLV Offs
+ lwz r0, (\Offs*4)(r5)
+ stwux r0, r7, r4
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ la r5, -96(r1) ;# temporary space for reading in vectors
+
+ ;# Store 4 pels at word "Offs" in temp array, then advance r7
+ ;# to next row and read another 4 pels from the frame buffer.
+
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r7 ;# read first 4 pels
+
+ ;# 16 unaligned word accesses
+ RLV 0
+ RLV 4
+ RLV 8
+ RLV 12
+ RLV 1
+ RLV 5
+ RLV 9
+ RLV 13
+ RLV 2
+ RLV 6
+ RLV 10
+ RLV 14
+ RLV 3
+ RLV 7
+ RLV 11
+
+ stw r0, (15*4)(r5) ;# write last 4 pels
+
+ simple_vertical
+
+ ;# Read temp array, write frame buffer.
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r5 ;# read/write first 4 pels
+ stwx r0, 0, r7
+
+ WLV 4
+ WLV 8
+ WLV 12
+ WLV 1
+ WLV 5
+ WLV 9
+ WLV 13
+ WLV 2
+ WLV 6
+ WLV 10
+ WLV 14
+ WLV 3
+ WLV 7
+ WLV 11
+ WLV 15
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+_chromaSelectors:
+ .long _B_hihi
+ .long _B_Ures0
+ .long _B_Vres0
+ .long 0
+ .long _B_lolo
+ .long _B_Ures8
+ .long _B_Vres8
+ .long 0
+
+ .align 4
+_B_Vres8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
+
+ .align 4
+_B_Ures8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
+
+ .align 4
+_B_lolo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_Vres0:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+ .align 4
+_B_Ures0:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_hihi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm
new file mode 100644
index 000000000..227ef2a94
--- /dev/null
+++ b/vp8/common/ppc/platform_altivec.asm
@@ -0,0 +1,58 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl save_platform_context
+ .globl restore_platform_context
+
+;# W: store vector register \V at [\P], then advance \P by 16 bytes.
+.macro W V P
+ stvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+;# R: load vector register \V from [\P], then advance \P by 16 bytes.
+.macro R V P
+ lvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+;# r3 context_ptr
+;# Saves the callee-saved AltiVec registers v20..v31 into the 12x16-byte
+;# buffer at r3, advancing r3 via the W macro.
+;# Fix: label was misspelled "save_platform_contex", which left the
+;# .globl symbol save_platform_context undefined (link error for callers).
+ .align 2
+save_platform_context:
+ W v20, r3
+ W v21, r3
+ W v22, r3
+ W v23, r3
+ W v24, r3
+ W v25, r3
+ W v26, r3
+ W v27, r3
+ W v28, r3
+ W v29, r3
+ W v30, r3
+ W v31, r3
+
+ blr
+
+;# r3 context_ptr
+;# Restores the callee-saved AltiVec registers v20..v31 from the buffer at
+;# r3 (mirror of save_platform_context), advancing r3 via the R macro.
+ .align 2
+restore_platform_context:
+ R v20, r3
+ R v21, r3
+ R v22, r3
+ R v23, r3
+ R v24, r3
+ R v25, r3
+ R v26, r3
+ R v27, r3
+ R v28, r3
+ R v29, r3
+ R v30, r3
+ R v31, r3
+
+ blr
diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm
new file mode 100644
index 000000000..f478b954c
--- /dev/null
+++ b/vp8/common/ppc/recon_altivec.asm
@@ -0,0 +1,174 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl recon4b_ppc
+ .globl recon2b_ppc
+ .globl recon_b_ppc
+
+;# Reconstruct one row of 16 pels: dst = sat8(pred + diff).
+;# Widens 16 predictor bytes to shorts (v0 must be zero), adds the 16
+;# short diffs (second 8 read at offset r8, which callers set to 16),
+;# packs back to unsigned bytes with saturation, stores, then advances
+;# Pred by 16, Diff by 32, and Dst by Stride.
+.macro row_of16 Diff Pred Dst Stride
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ addi \Pred, \Pred, 16 ;# next pred
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ addi \Diff, \Diff, 32 ;# next diff
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, \Dst ;# to dst
+ add \Dst, \Dst, \Stride ;# next dst
+.endm
+
+ .text
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+;# Reconstructs a 16x4 block: four rows of dst = sat8(pred + diff),
+;# via the row_of16 macro (v0 zeroed, r8 = 16 for the high diff half).
+recon4b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000 ;# enable v0..v3 in VRSAVE
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+;# Reconstruct two 8-pel rows: sat8(pred + diff) for 16 pels, written to
+;# dst 8 bytes per row via word stores staged through the stack buffer at
+;# r10 (dst may be unaligned).  \write_first_four_pels selects whether the
+;# first word goes to dst+0 or to the next row (stwux advances \Dst).
+;# Requires v0 == 0 and r8 == 16, set up by the caller.
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10 ;# 2 rows to dst from buf
+ lwz r0, 0(r10)
+.if \write_first_four_pels
+ stw r0, 0(\Dst)
+ .else
+ stwux r0, \Dst, \Stride
+.endif
+ lwz r0, 4(r10)
+ stw r0, 4(\Dst)
+ lwz r0, 8(r10)
+ stwux r0, \Dst, \Stride ;# advance dst to next row
+ lwz r0, 12(r10)
+ stw r0, 4(\Dst)
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+;# Reconstructs an 8x4 block as two calls to two_rows_of8, staging each
+;# 16-pel result in a stack buffer (r10) so dst need not be aligned.
+
+recon2b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000 ;# enable v0..v3 in VRSAVE
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ la r10, -48(r1) ;# buf
+
+ two_rows_of8 r3, r4, r5, r6, 1
+
+ addi r4, r4, 16; ;# next pred
+ addi r3, r3, 32; ;# next diff
+
+ two_rows_of8 r3, r4, r5, r6, 0
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+;# Gather two 4-short diff rows (stride 32 bytes apart in the diff array)
+;# into v3 via the stack buffer at r10.  Expects the first word already in
+;# r0; leaves r3 advanced to the second row (lwzu updates r3).
+.macro get_two_diff_rows
+ stw r0, 0(r10)
+ lwz r0, 4(r3)
+ stw r0, 4(r10)
+ lwzu r0, 32(r3)
+ stw r0, 8(r10)
+ lwz r0, 4(r3)
+ stw r0, 12(r10)
+ lvx v3, 0, r10
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+;# Reconstructs a single 4x4 block: dst = sat8(pred + diff).
+;# Pred rows are 16 bytes apart and diff rows 32 bytes apart, so four
+;# 4-pel rows are gathered word-by-word into the stack buffer at r10,
+;# processed as one 16-pel vector, then scattered back a word per row.
+recon_b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000 ;# enable v0..v3 in VRSAVE
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+
+ la r10, -48(r1) ;# buf
+
+ ;# gather the four 4-byte pred rows into one vector
+ lwz r0, 0(r4)
+ stw r0, 0(r10)
+ lwz r0, 16(r4)
+ stw r0, 4(r10)
+ lwz r0, 32(r4)
+ stw r0, 8(r10)
+ lwz r0, 48(r4)
+ stw r0, 12(r10)
+
+ lvx v1, 0, r10; ;# v1 = pred = p0..p15
+
+ lwz r0, 0(r3) ;# v3 = d0..d7
+
+ get_two_diff_rows
+
+ vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
+ vaddshs v2, v2, v3; ;# v2 = r0..r7
+
+ lwzu r0, 32(r3) ;# v3 = d8..d15
+
+ get_two_diff_rows
+
+ vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
+ vaddshs v3, v3, v1; ;# v3 = r8..r15
+
+ vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10; ;# 16 pels to dst from buf
+
+ ;# scatter one word per dst row (stwux advances r5 by stride)
+ lwz r0, 0(r10)
+ stw r0, 0(r5)
+ lwz r0, 4(r10)
+ stwux r0, r5, r6
+ lwz r0, 8(r10)
+ stwux r0, r5, r6
+ lwz r0, 12(r10)
+ stwx r0, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
new file mode 100644
index 000000000..284731085
--- /dev/null
+++ b/vp8/common/ppc/systemdependent.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "g_common.h"
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp8_post_proc_down_and_across)(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit
+);
+
+extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp8_post_proc_down_and_across_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit
+);
+void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp8_copy_mem16x16;
+extern copy_mem_block_function *vp8_copy_mem8x8;
+extern copy_mem_block_function *vp8_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp8_sixtap_predict_c;
+extern subpixel_predict_function vp8_sixtap_predict8x4_c;
+extern subpixel_predict_function vp8_sixtap_predict8x8_c;
+extern subpixel_predict_function vp8_sixtap_predict16x16_c;
+extern subpixel_predict_function vp8_bilinear_predict4x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x8_c;
+extern subpixel_predict_function vp8_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp8_copy_mem16x16_c;
+extern copy_mem_block_function vp8_copy_mem8x8_c;
+extern copy_mem_block_function vp8_copy_mem8x4_c;
+
+/* NOTE(review): these prototypes declare (diff, pred, dst, stride), but the
+ * definitions in vp8/common/recon.c take (pred, diff, dst, stride).  These
+ * are conflicting declarations for the same functions -- confirm which
+ * parameter order is intended and reconcile. */
+void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp8_loop_filter_mbv_c;
+extern loop_filter_block_function vp8_loop_filter_bv_c;
+extern loop_filter_block_function vp8_loop_filter_mbh_c;
+extern loop_filter_block_function vp8_loop_filter_bh_c;
+
+extern loop_filter_block_function vp8_loop_filter_mbvs_c;
+extern loop_filter_block_function vp8_loop_filter_bvs_c;
+extern loop_filter_block_function vp8_loop_filter_mbhs_c;
+extern loop_filter_block_function vp8_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+/* No-op machine-state reset for platforms (like PPC) that need no
+ * per-frame state clearing (cf. x86 EMMS). */
+void vp8_clear_c(void)
+{
+}
+
+/* Wires the common-code function pointers to PPC/AltiVec implementations
+ * where they exist, falling back to the generic C versions elsewhere
+ * (recon, IDCT-1/DC, postproc, 8x8/8x4 copies). */
+void vp8_machine_specific_config(void)
+{
+ // Pure C:
+ vp8_clear_system_state = vp8_clear_c;
+ vp8_recon_b = vp8_recon_b_c;
+ vp8_recon4b = vp8_recon4b_c;
+ vp8_recon2b = vp8_recon2b_c;
+
+ // AltiVec sub-pixel prediction
+ vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
+ vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
+ vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
+ vp8_bilinear_predict = bilinear_predict4x4_ppc;
+
+ vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
+ vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
+ vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
+ vp8_sixtap_predict = sixtap_predict_ppc;
+
+ // full 4x4 IDCT is AltiVec; the 1-coeff and DC-only paths stay in C
+ vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
+ vp8_short_idct4x4 = short_idct4x4llm_ppc;
+ vp8_dc_only_idct = vp8_dc_only_idct_c;
+
+ // AltiVec loop filters (normal and simple variants)
+ vp8_lf_mbvfull = loop_filter_mbv_ppc;
+ vp8_lf_bvfull = loop_filter_bv_ppc;
+ vp8_lf_mbhfull = loop_filter_mbh_ppc;
+ vp8_lf_bhfull = loop_filter_bh_ppc;
+
+ vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
+ vp8_lf_bvsimple = loop_filter_bvs_ppc;
+ vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
+ vp8_lf_bhsimple = loop_filter_bhs_ppc;
+
+ // postprocessing has no AltiVec version yet
+ vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c;
+ vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+ vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+ vp8_plane_add_noise = vp8_plane_add_noise_c;
+
+ vp8_copy_mem16x16 = copy_mem16x16_ppc;
+ vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+ vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+
+}
diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h
new file mode 100644
index 000000000..c66397682
--- /dev/null
+++ b/vp8/common/ppflags.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PPFLAGS_H
+#define __INC_PPFLAGS_H
+enum
+{
+ VP8D_NOFILTERING = 0,
+ VP8D_DEBLOCK = 1,
+ VP8D_DEMACROBLOCK = 2,
+ VP8D_ADDNOISE = 4,
+ VP8D_DEBUG_LEVEL1 = 8,
+ VP8D_DEBUG_LEVEL2 = 16,
+ VP8D_DEBUG_LEVEL3 = 32,
+ VP8D_DEBUG_LEVEL4 = 64,
+};
+
+#endif
diff --git a/vp8/common/pragmas.h b/vp8/common/pragmas.h
new file mode 100644
index 000000000..25a4b776f
--- /dev/null
+++ b/vp8/common/pragmas.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
diff --git a/vp8/common/predictdc.c b/vp8/common/predictdc.c
new file mode 100644
index 000000000..df4c96e4a
--- /dev/null
+++ b/vp8/common/predictdc.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "blockd.h"
+
+
+/* DC coefficient prediction with a consistency counter.
+ * If the last several DCs were similar (*cons above DCPREDCNTTHRESH), the
+ * incoming DC is treated as a delta and the previous DC is added back.
+ * The counter is advanced when this DC matches the last one within
+ * DCPREDSIMTHRESH * quant and both are nonzero with the same sign;
+ * otherwise it resets to 0.  *thisdc and *lastdc are updated to the
+ * reconstructed value. */
+void vp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons)
+{
+ int diff;
+ int sign;
+ int last_dc = *lastdc;
+ int this_dc = *thisdc;
+
+ if (*cons > DCPREDCNTTHRESH)
+ {
+ this_dc += last_dc;
+ }
+
+ diff = abs(last_dc - this_dc);
+ /* sign-bit trick: nonzero iff the signs differ or either value is 0.
+ * NOTE(review): >>31 assumes 32-bit int with arithmetic right shift;
+ * implementation-defined for negative values (true on the intended
+ * targets). */
+ sign = (last_dc >> 31) ^(this_dc >> 31);
+ sign |= (!last_dc | !this_dc);
+
+ if (sign)
+ {
+ *cons = 0;
+ }
+ else
+ {
+ if (diff <= DCPREDSIMTHRESH * quant)
+ (*cons)++ ;
+ }
+
+ *thisdc = this_dc;
+ *lastdc = this_dc;
+}
diff --git a/vp8/common/predictdc.h b/vp8/common/predictdc.h
new file mode 100644
index 000000000..b8871e452
--- /dev/null
+++ b/vp8/common/predictdc.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __PREDICTDC_H
+#define __PREDICTDC_H
+
+/* NOTE(review): uvvp8_predict_dc has no definition in predictdc.c; verify
+ * it is provided elsewhere or whether a different name was intended. */
+void uvvp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons);
+void vp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons);
+
+#endif
diff --git a/vp8/common/preproc.h b/vp8/common/preproc.h
new file mode 100644
index 000000000..00ec9a8d7
--- /dev/null
+++ b/vp8/common/preproc.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : preproc.h
+*
+* Description : simple preprocessor
+*
+****************************************************************************/
+
+#ifndef __INC_PREPROC_H
+#define __INC_PREPROC_H
+
+/****************************************************************************
+* Types
+****************************************************************************/
+
+typedef struct
+{
+ unsigned char *frame_buffer;
+ int frame;
+ unsigned int *fixed_divide;
+
+ unsigned char *frame_buffer_alloc;
+ unsigned int *fixed_divide_alloc;
+} pre_proc_instance;
+
+/****************************************************************************
+* Functions.
+****************************************************************************/
+void pre_proc_machine_specific_config(void);
+void delete_pre_proc(pre_proc_instance *ppi);
+int init_pre_proc(pre_proc_instance *ppi, int frame_size);
+extern void spatial_filter_c(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int width, int height, int pitch, int strength);
+extern void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+#endif
diff --git a/vp8/common/preprocif.h b/vp8/common/preprocif.h
new file mode 100644
index 000000000..986c45b10
--- /dev/null
+++ b/vp8/common/preprocif.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : preproc_if.h
+*
+* Description : Pre-processor interface header file.
+*
+****************************************************************************/
+
+#ifndef __PREPROC_IF_H
+#define __PREPROC_IF_H
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "type_aliases.h"
+
+/****************************************************************************
+* Types
+****************************************************************************/
+
+typedef struct
+{
+ UINT8 *Yuv0ptr;
+ UINT8 *Yuv1ptr;
+
+ UINT8 *frag_info; // blocks coded : passed in
+ UINT32 frag_info_element_size; // size of each element
+ UINT32 frag_info_coded_mask; // mask to get at whether fragment is coded
+
+ UINT32 *region_index; // Gives pixel index for top left of each block
+ UINT32 video_frame_height;
+ UINT32 video_frame_width;
+ UINT8 hfrag_pixels;
+ UINT8 vfrag_pixels;
+
+} SCAN_CONFIG_DATA;
+
+typedef enum
+{
+ SCP_FILTER_ON_OFF,
+ SCP_SET_SRF_OFFSET,
+ SCP_SET_EBO_ON_OFF,
+ SCP_SET_VCAP_LEVEL_OFFSET,
+ SCP_SET_SHOW_LOCAL
+
+} SCP_SETTINGS;
+
+typedef struct PP_INSTANCE *x_pp_inst;
+
+/****************************************************************************
+* Module statics
+****************************************************************************/
+/* Controls whether Early break out is on or off in default case */
+#define EARLY_BREAKOUT_DEFAULT TRUE
+
+/****************************************************************************
+* Functions
+****************************************************************************/
+extern void set_scan_param(x_pp_inst ppi, UINT32 param_id, INT32 param_value);
+extern UINT32 yuvanalyse_frame(x_pp_inst ppi, UINT32 *KFIndicator);
+extern x_pp_inst create_pp_instance(void);
+extern void delete_pp_instance(x_pp_inst *);
+extern BOOL scan_yuvinit(x_pp_inst, SCAN_CONFIG_DATA *scan_config_ptr);
+
+#endif
diff --git a/vp8/common/proposed.h b/vp8/common/proposed.h
new file mode 100644
index 000000000..1171ede43
--- /dev/null
+++ b/vp8/common/proposed.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+typedef struct core_codec *codec_ptr;
+typedef struct interface_table *interface_ptr;
+
+typedef struct
+{
+ void (*Initialize)();
+ void (*Shutdown)();
+ codec_ptr(*Create)();
+ int (*compress_frame)(codec_ptr, unsigned int *frame_flags, YV12_BUFFER_CONFIG *sd, unsigned long *size, char *dest, INT64 time_stamp);
+ int (*show_frame)(codec_ptr , YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
+ void (*Remove)(codec_ptr *comp);
+ interface_ptr(*get_interface)(unsigned int id);
+
+} core_codec;
+
+typedef struct
+{
+ int (*set_bitrate)(codec_ptr, END_USAGE usage, int Datarate);
+ int (*get_bitrate)(codec_ptr, END_USAGE *usage, int *Datarate);
+ int (*set_mode)(codec_ptr, MODE mode, int Speed, char *File);
+ int (*get_mode)(codec_ptr, MODE *mode, int *Speed, char **File);
+} codec_settings_basic;
+
+typedef struct
+{
+ int (*set_bitrate)(codec_ptr, END_USAGE usage, int Datarate);
+ int (*get_bitrate)(codec_ptr, END_USAGE *usage, int *Datarate);
+ int (*set_mode)(codec_ptr, MODE mode, int Speed, char *File);
+ int (*get_mode)(codec_ptr, MODE *mode, int *Speed, char **File);
+ int (*set_denoise)(codec_ptr, int Level);
+ int (*get_denoise)(codec_ptr, int *Level);
+ int (*set_sharpness)(codec_ptr, int sharpness);
+ int (*get_sharpness)(codec_ptr, int *sharpness);
+ int (*set_keyframing)(codec_ptr, int Auto, int max_distance);
+ int (*get_keyframing)(codec_ptr, int *Auto, int *max_distance);
+ int (*set_buffering)(codec_ptr, int buffer_level, int max_buffer_level);
+ int (*get_buffering)(codec_ptr, int *buffer_level, int *max_buffer_level);
+ int (*set_adjust_frame_rate)(codec_ptr, int Allowed, int at_buffer_level_pct);
+ int (*get_adjust_frame_rate)(codec_ptr, int *Allowed, int *at_buffer_level_pct);
+ int (*set_adjust_frame_size)(codec_ptr, int Allowed, int down_at_buffer_level_pct, int up_at_buffer_level_pct);
+ int (*get_adjust_frame_size)(codec_ptr, int *Allowed, int *down_at_buffer_level_pct, int *up_at_buffer_level_pct);
+ int (*set_adjust_quality)(codec_ptr, int Allowed, int min_quantizer, int max_quantizer);
+ int (*get_adjust_quality)(codec_ptr, int *Allowed, int *min_quantizer, int *max_quantizer);
+ int (*set_vbrparms)(codec_ptr, int Bias, int Min, int Max);
+ int (*get_vbrparms)(codec_ptr, int *Bias, int *Min, int *Max);
+
+} codec_settings_v1;
+
+typedef struct
+{
+ int (*request_recovery)(codec_ptr);
+ int (*request_droppable)(codec_ptr);
+ int (*internal_size)(codec_ptr, VPX_SCALING Vertical, VPX_SCALING Horizontal);
+ int (*update_last)(codec_ptr);
+ int (*update_gold)(codec_ptr);
+ int (*use_only_last)(codec_ptr);
+ int (*use_only_gold)(codec_ptr);
+ int (*update_entropy)(codec_ptr);
+
+} codec_realtime_requests;
diff --git a/vp8/common/quant_common.c b/vp8/common/quant_common.c
new file mode 100644
index 000000000..09fe31fe5
--- /dev/null
+++ b/vp8/common/quant_common.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17,
+ 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
+ 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152,
+ 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209,
+ 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284,
+};
+
+
+/* Y1 DC quantizer: clamp QIndex+Delta to [0,127], look up dc_qlookup. */
+int vp8_dc_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+ return retval;
+}
+
+/* Y2 (second-order) DC quantizer: DC lookup doubled. */
+int vp8_dc2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ] * 2;
+ return retval;
+
+}
+/* Chroma DC quantizer: DC lookup capped at 132. */
+int vp8_dc_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+
+ if (retval > 132)
+ retval = 132;
+
+ return retval;
+}
+
+/* Y1 AC quantizer: no delta, straight ac_qlookup. */
+int vp8_ac_yquant(int QIndex)
+{
+ int retval;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
+
+/* Y2 (second-order) AC quantizer: AC lookup scaled by 155/100,
+ * floored at 8. */
+int vp8_ac2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = (ac_qlookup[ QIndex ] * 155) / 100;
+
+ if (retval < 8)
+ retval = 8;
+
+ return retval;
+}
+/* Chroma AC quantizer: straight ac_qlookup with delta. */
+int vp8_ac_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
diff --git a/vp8/common/quant_common.h b/vp8/common/quant_common.h
new file mode 100644
index 000000000..0c92ce8b9
--- /dev/null
+++ b/vp8/common/quant_common.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/* Fix: add the include guard every other header in this tree has;
+ * without it, double inclusion redeclares these prototypes and
+ * re-includes blockd.h/onyxc_int.h. */
+#ifndef __INC_QUANT_COMMON_H
+#define __INC_QUANT_COMMON_H
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern int vp8_ac_yquant(int QIndex);
+extern int vp8_dc_quant(int QIndex, int Delta);
+extern int vp8_dc2quant(int QIndex, int Delta);
+extern int vp8_ac2quant(int QIndex, int Delta);
+extern int vp8_dc_uv_quant(int QIndex, int Delta);
+extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#endif
diff --git a/vp8/common/recon.c b/vp8/common/recon.c
new file mode 100644
index 000000000..d1268ea22
--- /dev/null
+++ b/vp8/common/recon.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "blockd.h"
+
+/* Reconstruct a 4x4 block: dst = clamp(pred + diff, 0, 255).
+ * pred_ptr/diff_ptr rows are 16 elements apart (macroblock layout);
+ * dst rows are `stride` bytes apart. */
+void vp8_recon_b_c
+(
+ unsigned char *pred_ptr,
+ short *diff_ptr,
+ unsigned char *dst_ptr,
+ int stride
+)
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = diff_ptr[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 16;
+ pred_ptr += 16;
+ }
+}
+
+/* Reconstruct a 16x4 region (four 4x4 blocks side by side):
+ * dst = clamp(pred + diff, 0, 255); rows 16 elements apart in
+ * pred/diff, `stride` bytes apart in dst. */
+void vp8_recon4b_c
+(
+ unsigned char *pred_ptr,
+ short *diff_ptr,
+ unsigned char *dst_ptr,
+ int stride
+)
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int a = diff_ptr[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 16;
+ pred_ptr += 16;
+ }
+}
+
+/* Reconstruct an 8x4 region (two 4x4 blocks side by side):
+ * dst = clamp(pred + diff, 0, 255); note rows are 8 elements apart
+ * here (chroma layout), unlike the 16-wide variants. */
+void vp8_recon2b_c
+(
+ unsigned char *pred_ptr,
+ short *diff_ptr,
+ unsigned char *dst_ptr,
+ int stride
+)
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ int a = diff_ptr[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 8;
+ pred_ptr += 8;
+ }
+}
+
+/* Reconstruct the 16x16 luma plane of a macroblock: the 16 luma blocks
+ * are processed four at a time (one 16x4 recon4 call per block row). */
+void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 4)
+ {
+ BLOCKD *b = &x->block[i];
+
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
+
+/* Reconstruct a full macroblock: luma blocks 0-15 four at a time
+ * (recon4), then chroma blocks 16-23 two at a time (recon2). */
+void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 4)
+ {
+ BLOCKD *b = &x->block[i];
+
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *b = &x->block[i];
+
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
new file mode 100644
index 000000000..f65a90f7e
--- /dev/null
+++ b/vp8/common/recon.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECON_H
+#define __INC_RECON_H
+
+#define prototype_copy_block(sym) \
+ void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
+
+#define prototype_recon_block(sym) \
+ void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch);
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/recon_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/recon_arm.h"
+#endif
+
+#ifndef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp8_copy_mem16x16_c
+#endif
+extern prototype_copy_block(vp8_recon_copy16x16);
+
+#ifndef vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp8_copy_mem8x8_c
+#endif
+extern prototype_copy_block(vp8_recon_copy8x8);
+
+#ifndef vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp8_copy_mem8x4_c
+#endif
+extern prototype_copy_block(vp8_recon_copy8x4);
+
+#ifndef vp8_recon_recon
+#define vp8_recon_recon vp8_recon_b_c
+#endif
+extern prototype_recon_block(vp8_recon_recon);
+
+#ifndef vp8_recon_recon2
+#define vp8_recon_recon2 vp8_recon2b_c
+#endif
+extern prototype_recon_block(vp8_recon_recon2);
+
+#ifndef vp8_recon_recon4
+#define vp8_recon_recon4 vp8_recon4b_c
+#endif
+extern prototype_recon_block(vp8_recon_recon4);
+
+typedef prototype_copy_block((*vp8_copy_block_fn_t));
+typedef prototype_recon_block((*vp8_recon_fn_t));
+typedef struct
+{
+ vp8_copy_block_fn_t copy16x16;
+ vp8_copy_block_fn_t copy8x8;
+ vp8_copy_block_fn_t copy8x4;
+ vp8_recon_fn_t recon;
+ vp8_recon_fn_t recon2;
+ vp8_recon_fn_t recon4;
+} vp8_recon_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RECON_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define RECON_INVOKE(ctx,fn) vp8_recon_##fn
+#endif
+
+#include "blockd.h"
+void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+#endif
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
new file mode 100644
index 000000000..c48886deb
--- /dev/null
+++ b/vp8/common/reconinter.c
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "subpixel.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+// use this define on systems where unaligned int reads and writes are
+// not allowed, i.e. ARM architectures
+//#define MUST_BE_ALIGNED
+
+
+static const int bbb[4] = {0, 2, 8, 10};
+
+
+
+void vp8_copy_mem16x16_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+
+ int r;
+
+ for (r = 0; r < 16; r++)
+ {
+#ifdef MUST_BE_ALIGNED
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+ dst[8] = src[8];
+ dst[9] = src[9];
+ dst[10] = src[10];
+ dst[11] = src[11];
+ dst[12] = src[12];
+ dst[13] = src[13];
+ dst[14] = src[14];
+ dst[15] = src[15];
+
+#else
+ ((int *)dst)[0] = ((int *)src)[0] ;
+ ((int *)dst)[1] = ((int *)src)[1] ;
+ ((int *)dst)[2] = ((int *)src)[2] ;
+ ((int *)dst)[3] = ((int *)src)[3] ;
+
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x8_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 8; r++)
+ {
+#ifdef MUST_BE_ALIGNED
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((int *)dst)[0] = ((int *)src)[0] ;
+ ((int *)dst)[1] = ((int *)src)[1] ;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x4_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+#ifdef MUST_BE_ALIGNED
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((int *)dst)[0] = ((int *)src)[0] ;
+ ((int *)dst)[1] = ((int *)src)[1] ;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+
+
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+
+ ptr_base = *(d->base_pre);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ }
+ else
+ {
+ ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ ptr = ptr_base;
+
+ for (r = 0; r < 4; r++)
+ {
+#ifdef MUST_BE_ALIGNED
+ pred_ptr[0] = ptr[0];
+ pred_ptr[1] = ptr[1];
+ pred_ptr[2] = ptr[2];
+ pred_ptr[3] = ptr[3];
+#else
+ *(int *)pred_ptr = *(int *)ptr ;
+#endif
+ pred_ptr += pitch;
+ ptr += d->pre_stride;
+ }
+ }
+}
+
+void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+
+ ptr_base = *(d->base_pre);
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, pred_ptr, pitch);
+ }
+}
+
+void vp8_build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+
+ ptr_base = *(d->base_pre);
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, pred_ptr, pitch);
+ }
+}
+
+
+void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
+{
+ int i;
+
+ if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV)
+ {
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+
+ int mv_row = x->block[16].bmi.mv.as_mv.row;
+ int mv_col = x->block[16].bmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->block[16].pre_stride;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
+ }
+ }
+ else
+ {
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b(x, d0, 8);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+ }
+ }
+ }
+}
+
+
+void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
+{
+ if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV)
+ {
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = x->predictor;
+ int mv_row = x->mbmi.mv.as_mv.row;
+ int mv_col = x->mbmi.mv.as_mv.col;
+ int pre_stride = x->block[0].pre_stride;
+
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
+ }
+ }
+ else
+ {
+ int i;
+
+ if (x->mbmi.partitioning < 3)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ BLOCKD *d = &x->block[bbb[i]];
+ vp8_build_inter_predictors4b(x, d, 16);
+ }
+
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b(x, d0, 16);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
+ }
+
+ }
+ }
+ }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
+{
+ if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV)
+ {
+ int offset;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+ unsigned char *pred_ptr = x->predictor;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+
+ int mv_row = x->mbmi.mv.as_mv.row;
+ int mv_col = x->mbmi.mv.as_mv.col;
+ int pre_stride = x->block[0].pre_stride;
+
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
+ }
+
+ mv_row = x->block[16].bmi.mv.as_mv.row;
+ mv_col = x->block[16].bmi.mv.as_mv.col;
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
+ }
+ }
+ else
+ {
+ int i;
+
+ if (x->mbmi.partitioning < 3)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ BLOCKD *d = &x->block[bbb[i]];
+ vp8_build_inter_predictors4b(x, d, 16);
+ }
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b(x, d0, 16);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
+ }
+
+ }
+
+ }
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b(x, d0, 8);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+ }
+
+ }
+
+ }
+}
+
+void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
+{
+ int i, j;
+
+ if (x->mbmi.mode == SPLITMV)
+ {
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.row
+ + x->block[yoffset+1].bmi.mv.as_mv.row
+ + x->block[yoffset+4].bmi.mv.as_mv.row
+ + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+ if (temp < 0) temp -= 4;
+ else temp += 4;
+
+ x->block[uoffset].bmi.mv.as_mv.row = temp / 8;
+
+ if (fullpixel)
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & 0xfffffff8;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.col
+ + x->block[yoffset+1].bmi.mv.as_mv.col
+ + x->block[yoffset+4].bmi.mv.as_mv.col
+ + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+ if (temp < 0) temp -= 4;
+ else temp += 4;
+
+ x->block[uoffset].bmi.mv.as_mv.col = temp / 8;
+
+ if (fullpixel)
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & 0xfffffff8;
+
+ x->block[voffset].bmi.mv.as_mv.row = x->block[uoffset].bmi.mv.as_mv.row ;
+ x->block[voffset].bmi.mv.as_mv.col = x->block[uoffset].bmi.mv.as_mv.col ;
+ }
+ }
+ }
+ else
+ {
+ int mvrow = x->mbmi.mv.as_mv.row;
+ int mvcol = x->mbmi.mv.as_mv.col;
+
+ if (mvrow < 0)
+ mvrow -= 1;
+ else
+ mvrow += 1;
+
+ if (mvcol < 0)
+ mvcol -= 1;
+ else
+ mvcol += 1;
+
+ mvrow /= 2;
+ mvcol /= 2;
+
+ for (i = 0; i < 8; i++)
+ {
+ x->block[ 16 + i].bmi.mv.as_mv.row = mvrow;
+ x->block[ 16 + i].bmi.mv.as_mv.col = mvcol;
+
+ if (fullpixel)
+ {
+ x->block[ 16 + i].bmi.mv.as_mv.row = mvrow & 0xfffffff8;
+ x->block[ 16 + i].bmi.mv.as_mv.col = mvcol & 0xfffffff8;
+ }
+ }
+ }
+}
+
+
+// The following functions are written for skip_recon_mb() to call. Since there is no recon in this
+// situation, we can write the result directly to the dst buffer instead of writing it to the predictor
+// buffer and then copying it to the dst buffer.
+static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ //unsigned char *pred_ptr = d->predictor;
+ int dst_stride = d->dst_stride;
+ int pre_stride = d->pre_stride;
+
+ ptr_base = *(d->base_pre);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride);
+ }
+ else
+ {
+ ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ ptr = ptr_base;
+
+ for (r = 0; r < 4; r++)
+ {
+#ifdef MUST_BE_ALIGNED
+ dst_ptr[0] = ptr[0];
+ dst_ptr[1] = ptr[1];
+ dst_ptr[2] = ptr[2];
+ dst_ptr[3] = ptr[3];
+#else
+ *(int *)dst_ptr = *(int *)ptr ;
+#endif
+ dst_ptr += dst_stride;
+ ptr += pre_stride;
+ }
+ }
+}
+
+
+
+void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
+{
+ //unsigned char *pred_ptr = x->block[0].predictor;
+ //unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;
+ unsigned char *pred_ptr = x->predictor;
+ unsigned char *dst_ptr = x->dst.y_buffer;
+
+ if (x->mbmi.mode != SPLITMV)
+ {
+ int offset;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+ //unsigned char *pred_ptr = x->predictor;
+ //unsigned char *upred_ptr = &x->predictor[256];
+ //unsigned char *vpred_ptr = &x->predictor[320];
+ unsigned char *udst_ptr = x->dst.u_buffer;
+ unsigned char *vdst_ptr = x->dst.v_buffer;
+
+ int mv_row = x->mbmi.mv.as_mv.row;
+ int mv_col = x->mbmi.mv.as_mv.col;
+ int pre_stride = x->dst.y_stride; //x->block[0].pre_stride;
+
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ }
+
+ mv_row = x->block[16].bmi.mv.as_mv.row;
+ mv_col = x->block[16].bmi.mv.as_mv.col;
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
+ }
+ }
+ else
+ {
+        //note: this whole ELSE branch is never executed, so there is no way to test the correctness of this modification.
+        //If something turns out to be wrong later, revert to the corresponding code in vp8_build_inter_predictors_mb.
+ int i;
+
+ if (x->mbmi.partitioning < 3)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ BLOCKD *d = &x->block[bbb[i]];
+ //vp8_build_inter_predictors4b(x, d, 16);
+
+ {
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+
+ ptr_base = *(d->base_pre);
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ }
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ {
+ //vp8_build_inter_predictors2b(x, d0, 16);
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d0->predictor;
+
+ ptr_base = *(d0->base_pre);
+ ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+
+ if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
+ vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
+ }
+ }
+ }
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ {
+ //vp8_build_inter_predictors2b(x, d0, 8);
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d0->predictor;
+
+ ptr_base = *(d0->base_pre);
+ ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+
+ if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
+ vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
+ }
+ }
+ }
+}
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
new file mode 100644
index 000000000..b2d1ae97a
--- /dev/null
+++ b/vp8/common/reconinter.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTER_H
+#define __INC_RECONINTER_H
+
+extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
+extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
+
+extern void vp8_build_inter_predictors_mby(MACROBLOCKD *x);
+extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
+extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x);
+
+#endif
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
new file mode 100644
index 000000000..e33bce348
--- /dev/null
+++ b/vp8/common/reconintra.c
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+
+// For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
+// vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
+
+void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *b = &x->block[i];
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
+
+void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
+{
+
+ unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride;
+ unsigned char yleft_col[16];
+ unsigned char ytop_left = yabove_row[-1];
+ unsigned char *ypred_ptr = x->predictor;
+ int r, c, i;
+
+ for (i = 0; i < 16; i++)
+ {
+ yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+ }
+
+ // for Y
+ switch (x->mbmi.mode)
+ {
+ case DC_PRED:
+ {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+
+
+ if (x->up_available || x->left_available)
+ {
+ if (x->up_available)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ average += yabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+
+ for (i = 0; i < 16; i++)
+ {
+ average += yleft_col[i];
+ }
+
+ }
+
+
+
+ shift = 3 + x->up_available + x->left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ }
+ else
+ {
+ expected_dc = 128;
+ }
+
+ vpx_memset(ypred_ptr, expected_dc, 256);
+ }
+ break;
+ case V_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+ ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+ ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+ ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+ ypred_ptr += 16;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ vpx_memset(ypred_ptr, yleft_col[r], 16);
+ ypred_ptr += 16;
+ }
+
+ }
+ break;
+ case TM_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += 16;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
+{
+
+ unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride;
+ unsigned char yleft_col[16];
+ unsigned char ytop_left = yabove_row[-1];
+ unsigned char *ypred_ptr = x->predictor;
+ int r, c, i;
+
+ int y_stride = x->dst.y_stride;
+ ypred_ptr = x->dst.y_buffer; //x->predictor;
+
+ for (i = 0; i < 16; i++)
+ {
+ yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+ }
+
+ // for Y
+ switch (x->mbmi.mode)
+ {
+ case DC_PRED:
+ {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+
+
+ if (x->up_available || x->left_available)
+ {
+ if (x->up_available)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ average += yabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+
+ for (i = 0; i < 16; i++)
+ {
+ average += yleft_col[i];
+ }
+
+ }
+
+
+
+ shift = 3 + x->up_available + x->left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ }
+ else
+ {
+ expected_dc = 128;
+ }
+
+ //vpx_memset(ypred_ptr, expected_dc, 256);
+ for (r = 0; r < 16; r++)
+ {
+ vpx_memset(ypred_ptr, expected_dc, 16);
+ ypred_ptr += y_stride; //16;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+ ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+ ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+ ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+ ypred_ptr += y_stride; //16;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ vpx_memset(ypred_ptr, yleft_col[r], 16);
+ ypred_ptr += y_stride; //16;
+ }
+
+ }
+ break;
+ case TM_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += y_stride; //16;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x)
+{
+ unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ unsigned char uleft_col[16];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+ unsigned char vleft_col[20];
+ unsigned char vtop_left = vabove_row[-1];
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+ int i, j;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+
+ switch (x->mbmi.uv_mode)
+ {
+ case DC_PRED:
+ {
+ int expected_udc;
+ int expected_vdc;
+ int i;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+
+ if (x->up_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2 + x->up_available + x->left_available;
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ vpx_memset(upred_ptr, expected_udc, 64);
+ vpx_memset(vpred_ptr, expected_vdc, 64);
+
+
+ }
+ break;
+ case V_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memcpy(upred_ptr, uabove_row, 8);
+ vpx_memcpy(vpred_ptr, vabove_row, 8);
+ upred_ptr += 8;
+ vpred_ptr += 8;
+ }
+
+ }
+ break;
+ case H_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, uleft_col[i], 8);
+ vpx_memset(vpred_ptr, vleft_col[i], 8);
+ upred_ptr += 8;
+ vpred_ptr += 8;
+ }
+ }
+
+ break;
+ case TM_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+ upred_ptr[j] = predu;
+ vpred_ptr[j] = predv;
+ }
+
+ upred_ptr += 8;
+ vpred_ptr += 8;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
+{
+ unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ unsigned char uleft_col[16];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+ unsigned char vleft_col[20];
+ unsigned char vtop_left = vabove_row[-1];
+ unsigned char *upred_ptr = x->dst.u_buffer; //&x->predictor[256];
+ unsigned char *vpred_ptr = x->dst.v_buffer; //&x->predictor[320];
+ int uv_stride = x->dst.uv_stride;
+
+ int i, j;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+
+ switch (x->mbmi.uv_mode)
+ {
+ case DC_PRED:
+ {
+ int expected_udc;
+ int expected_vdc;
+ int i;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+
+ if (x->up_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2 + x->up_available + x->left_available;
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ //vpx_memset(upred_ptr,expected_udc,64);
+ //vpx_memset(vpred_ptr,expected_vdc,64);
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, expected_udc, 8);
+ vpx_memset(vpred_ptr, expected_vdc, 8);
+ upred_ptr += uv_stride; //8;
+ vpred_ptr += uv_stride; //8;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memcpy(upred_ptr, uabove_row, 8);
+ vpx_memcpy(vpred_ptr, vabove_row, 8);
+ upred_ptr += uv_stride; //8;
+ vpred_ptr += uv_stride; //8;
+ }
+
+ }
+ break;
+ case H_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, uleft_col[i], 8);
+ vpx_memset(vpred_ptr, vleft_col[i], 8);
+ upred_ptr += uv_stride; //8;
+ vpred_ptr += uv_stride; //8;
+ }
+ }
+
+ break;
+ case TM_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+ upred_ptr[j] = predu;
+ vpred_ptr[j] = predv;
+ }
+
+ upred_ptr += uv_stride; //8;
+ vpred_ptr += uv_stride; //8;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h
new file mode 100644
index 000000000..d63aa15cb
--- /dev/null
+++ b/vp8/common/reconintra.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA_H
+#define __INC_RECONINTRA_H
+
+extern void init_intra_left_above_pixels(MACROBLOCKD *x);
+
+extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
+extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
+
+extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
+
+extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
+
+#endif
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
new file mode 100644
index 000000000..d92d5c96a
--- /dev/null
+++ b/vp8/common/reconintra4x4.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "vpx_mem/vpx_mem.h"
+#include "reconintra.h"
+
/*
 * Compute the 4x4 intra prediction for a single subblock.
 *
 * x         : block descriptor; the reconstruction buffer (*x->base_dst + x->dst)
 *             supplies the already-reconstructed neighbouring pixels.
 * b_mode    : one of the B_* 4x4 intra modes selected below.
 * predictor : output buffer; rows are written with a fixed stride of 16 bytes
 *             (predictor += 16 after each row).
 *
 * The prediction sources are the row of pixels directly above the block
 * (including up to 4 above-right pixels for the diagonal modes), the column
 * of 4 pixels to its left, and the single top-left corner pixel.
 * NOTE(review): the diagonal modes read Above[4..7]; this presumably relies on
 * vp8_intra_prediction_down_copy() having filled the above-right pixels first.
 */
void vp8_predict_intra4x4(BLOCKD *x,
                          int b_mode,
                          unsigned char *predictor)
{
    int i, r, c;

    /* Pointer to the 4 (plus above-right) pixels immediately above the block. */
    unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
    unsigned char Left[4];
    unsigned char top_left = Above[-1];

    /* Gather the 4 left-column pixels from the reconstruction buffer. */
    Left[0] = (*(x->base_dst))[x->dst - 1];
    Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
    Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
    Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];

    switch (b_mode)
    {
    case B_DC_PRED:
    {
        /* DC: fill the block with the rounded average of the 8 neighbours. */
        int expected_dc = 0;

        for (i = 0; i < 4; i++)
        {
            expected_dc += Above[i];
            expected_dc += Left[i];
        }

        /* 8 samples summed -> +4 for rounding, >>3 to divide. */
        expected_dc = (expected_dc + 4) >> 3;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                predictor[c] = expected_dc;
            }

            predictor += 16;
        }
    }
    break;
    case B_TM_PRED:
    {
        // prediction similar to true_motion prediction
        /* pred = Above[c] + Left[r] - top_left, clamped to [0, 255]. */
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int pred = Above[c] - top_left + Left[r];

                if (pred < 0)
                    pred = 0;

                if (pred > 255)
                    pred = 255;

                predictor[c] = pred;
            }

            predictor += 16;
        }
    }
    break;

    case B_VE_PRED:
    {
        /* Vertical: every row repeats the (1,2,1)-filtered above row. */
        unsigned int ap[4];
        ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
        ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
        ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
        ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {

                predictor[c] = ap[c];
            }

            predictor += 16;
        }

    }
    break;


    case B_HE_PRED:
    {
        /* Horizontal: every column repeats the (1,2,1)-filtered left column.
           The last tap duplicates Left[3] since no pixel exists below it. */
        unsigned int lp[4];
        lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
        lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
        lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
        lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                predictor[r] = lp[r];
            }

            predictor += 16;
        }
    }
    break;
    case B_LD_PRED:
    {
        /* Down-left diagonal: anti-diagonals filled from the filtered
           8-pixel above row (uses the 4 above-right pixels ptr[4..7]);
           the final tap duplicates ptr[7]. */
        unsigned char *ptr = Above;
        predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
        predictor[0 * 16 + 1] =
            predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
        predictor[0 * 16 + 2] =
            predictor[1 * 16 + 1] =
                predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
        predictor[0 * 16 + 3] =
            predictor[1 * 16 + 2] =
                predictor[2 * 16 + 1] =
                    predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
        predictor[1 * 16 + 3] =
            predictor[2 * 16 + 2] =
                predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
        predictor[2 * 16 + 3] =
            predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
        predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;

    }
    break;
    case B_RD_PRED:
    {
        /* Down-right diagonal: neighbours packed into pp[] in raster walk
           order (Left bottom-to-top, corner, Above left-to-right), then
           each diagonal gets one 3-tap filtered value. */
        unsigned char pp[9];

        pp[0] = Left[3];
        pp[1] = Left[2];
        pp[2] = Left[1];
        pp[3] = Left[0];
        pp[4] = top_left;
        pp[5] = Above[0];
        pp[6] = Above[1];
        pp[7] = Above[2];
        pp[8] = Above[3];

        predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[3 * 16 + 1] =
            predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[3 * 16 + 2] =
            predictor[2 * 16 + 1] =
                predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[3 * 16 + 3] =
            predictor[2 * 16 + 2] =
                predictor[1 * 16 + 1] =
                    predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[2 * 16 + 3] =
            predictor[1 * 16 + 2] =
                predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[1 * 16 + 3] =
            predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
        predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;

    }
    break;
    case B_VR_PRED:
    {
        /* Vertical-right: mixes 2-tap averages (half-pel positions) with
           3-tap filtered values along a near-vertical direction. */
        unsigned char pp[9];

        pp[0] = Left[3];
        pp[1] = Left[2];
        pp[2] = Left[1];
        pp[3] = Left[0];
        pp[4] = top_left;
        pp[5] = Above[0];
        pp[6] = Above[1];
        pp[7] = Above[2];
        pp[8] = Above[3];


        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[3 * 16 + 1] =
            predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[2 * 16 + 1] =
            predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
        predictor[3 * 16 + 2] =
            predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[2 * 16 + 2] =
            predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
        predictor[3 * 16 + 3] =
            predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
        predictor[2 * 16 + 3] =
            predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
        predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
        predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;

    }
    break;
    case B_VL_PRED:
    {
        /* Vertical-left: like B_VR but leaning the other way, sourced from
           the above row only (needs pixels through pp[7] = above-right). */
        unsigned char *pp = Above;

        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
        predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[2 * 16 + 0] =
            predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
        predictor[1 * 16 + 1] =
            predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 1] =
            predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
        predictor[3 * 16 + 1] =
            predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[0 * 16 + 3] =
            predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
        predictor[1 * 16 + 3] =
            predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
    }
    break;

    case B_HD_PRED:
    {
        /* Horizontal-down: near-horizontal direction mixing 2-tap and
           3-tap values from the left column, corner and above row. */
        unsigned char pp[9];
        pp[0] = Left[3];
        pp[1] = Left[2];
        pp[2] = Left[1];
        pp[3] = Left[0];
        pp[4] = top_left;
        pp[5] = Above[0];
        pp[6] = Above[1];
        pp[7] = Above[2];
        pp[8] = Above[3];


        predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
        predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[2 * 16 + 0] =
            predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
        predictor[2 * 16 + 1] =
            predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 2] =
            predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
        predictor[2 * 16 + 3] =
            predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
        predictor[1 * 16 + 2] =
            predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
        predictor[1 * 16 + 3] =
            predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
        predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
        predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
    }
    break;


    case B_HU_PRED:
    {
        /* Horizontal-up: sourced from the left column only; once the
           direction runs off the bottom, remaining cells repeat Left[3]. */
        unsigned char *pp = Left;
        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
        predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
        predictor[0 * 16 + 2] =
            predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
        predictor[0 * 16 + 3] =
            predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
        predictor[1 * 16 + 2] =
            predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
        predictor[1 * 16 + 3] =
            predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
        predictor[2 * 16 + 2] =
            predictor[2 * 16 + 3] =
                predictor[3 * 16 + 0] =
                    predictor[3 * 16 + 1] =
                        predictor[3 * 16 + 2] =
                            predictor[3 * 16 + 3] = pp[3];
    }
    break;


    }
}
+// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
+// to the right prediction have filled in pixels to use.
+void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
+{
+ unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
+
+ unsigned int *src_ptr = (unsigned int *)above_right;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
+
+
+void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ int i;
+
+ vp8_intra_prediction_down_copy(x);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &x->block[i];
+
+ vp8_predict_intra4x4(b, x->block[i].bmi.mode, x->block[i].predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+
+ vp8_recon_intra_mbuv(rtcd, x);
+
+}
diff --git a/vp8/common/reconintra4x4.h b/vp8/common/reconintra4x4.h
new file mode 100644
index 000000000..788c8c40a
--- /dev/null
+++ b/vp8/common/reconintra4x4.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA4x4_H
+#define __INC_RECONINTRA4x4_H
+
+extern void vp8_intra_prediction_down_copy(MACROBLOCKD *x);
+
+#endif
diff --git a/vp8/common/segmentation_common.c b/vp8/common/segmentation_common.c
new file mode 100644
index 000000000..72b8c874b
--- /dev/null
+++ b/vp8/common/segmentation_common.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "segmentation_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_update_gf_useage_maps(VP8_COMMON *cm, MACROBLOCKD *xd)
+{
+ int mb_row, mb_col;
+
+ MODE_INFO *this_mb_mode_info = cm->mi;
+
+ xd->gf_active_ptr = (signed char *)cm->gf_active_flags;
+
+ if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
+ {
+ // Reset Gf useage monitors
+ vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+ }
+ else
+ {
+ // for each macroblock row in image
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+
+ // If using golden then set GF active flag if not already set.
+ // If using last frame 0,0 mode then leave flag as it is
+ // else if using non 0,0 motion or intra modes then clear flag if it is currently set
+ if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME))
+ {
+ if (*(xd->gf_active_ptr) == 0)
+ {
+ *(xd->gf_active_ptr) = 1;
+ cm->gf_active_count ++;
+ }
+ }
+ else if ((this_mb_mode_info->mbmi.mode != ZEROMV) && *(xd->gf_active_ptr))
+ {
+ *(xd->gf_active_ptr) = 0;
+ cm->gf_active_count--;
+ }
+
+ xd->gf_active_ptr++; // Step onto next entry
+ this_mb_mode_info++; // skip to next mb
+
+ }
+
+ // this is to account for the border
+ this_mb_mode_info++;
+ }
+ }
+}
diff --git a/vp8/common/segmentation_common.h b/vp8/common/segmentation_common.h
new file mode 100644
index 000000000..bb93533a3
--- /dev/null
+++ b/vp8/common/segmentation_common.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern void vp8_update_gf_useage_maps(VP8_COMMON *cm, MACROBLOCKD *xd);
diff --git a/vp8/common/setupintrarecon.c b/vp8/common/setupintrarecon.c
new file mode 100644
index 000000000..dcaafe6c6
--- /dev/null
+++ b/vp8/common/setupintrarecon.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+
+ // set up frame new frame for intra coded blocks
+ vpx_memset(ybf->y_buffer - 1 - 2 * ybf->y_stride, 127, ybf->y_width + 5);
+ vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+
+ for (i = 0; i < ybf->y_height; i++)
+ ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->u_buffer - 1 - 2 * ybf->uv_stride, 127, ybf->uv_width + 5);
+ vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->v_buffer - 1 - 2 * ybf->uv_stride, 127, ybf->uv_width + 5);
+ vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+}
diff --git a/vp8/common/setupintrarecon.h b/vp8/common/setupintrarecon.h
new file mode 100644
index 000000000..6ec79b29c
--- /dev/null
+++ b/vp8/common/setupintrarecon.h
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
diff --git a/vp8/common/subpixel.h b/vp8/common/subpixel.h
new file mode 100644
index 000000000..fbd5f4daf
--- /dev/null
+++ b/vp8/common/subpixel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_H
+#define SUBPIXEL_H
+
+#define prototype_subpixel_predict(sym) \
+ void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
+ unsigned char *dst, int dst_pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/subpixel_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/subpixel_arm.h"
+#endif
+
+#ifndef vp8_subpix_sixtap16x16
+#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_sixtap16x16);
+
+#ifndef vp8_subpix_sixtap8x8
+#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_sixtap8x8);
+
+#ifndef vp8_subpix_sixtap8x4
+#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_sixtap8x4);
+
+#ifndef vp8_subpix_sixtap4x4
+#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_sixtap4x4);
+
+#ifndef vp8_subpix_bilinear16x16
+#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_bilinear16x16);
+
+#ifndef vp8_subpix_bilinear8x8
+#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_bilinear8x8);
+
+#ifndef vp8_subpix_bilinear8x4
+#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_bilinear8x4);
+
+#ifndef vp8_subpix_bilinear4x4
+#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_c
+#endif
+extern prototype_subpixel_predict(vp8_subpix_bilinear4x4);
+
+typedef prototype_subpixel_predict((*vp8_subpix_fn_t));
+typedef struct
+{
+ vp8_subpix_fn_t sixtap16x16;
+ vp8_subpix_fn_t sixtap8x8;
+ vp8_subpix_fn_t sixtap8x4;
+ vp8_subpix_fn_t sixtap4x4;
+ vp8_subpix_fn_t bilinear16x16;
+ vp8_subpix_fn_t bilinear8x8;
+ vp8_subpix_fn_t bilinear8x4;
+ vp8_subpix_fn_t bilinear4x4;
+} vp8_subpix_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SUBPIX_INVOKE(ctx,fn) vp8_subpix_##fn
+#endif
+
+#endif
diff --git a/vp8/common/swapyv12buffer.c b/vp8/common/swapyv12buffer.c
new file mode 100644
index 000000000..afe6a885e
--- /dev/null
+++ b/vp8/common/swapyv12buffer.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
+{
+ unsigned char *temp;
+
+ temp = last_frame->buffer_alloc;
+ last_frame->buffer_alloc = new_frame->buffer_alloc;
+ new_frame->buffer_alloc = temp;
+
+ temp = last_frame->y_buffer;
+ last_frame->y_buffer = new_frame->y_buffer;
+ new_frame->y_buffer = temp;
+
+ temp = last_frame->u_buffer;
+ last_frame->u_buffer = new_frame->u_buffer;
+ new_frame->u_buffer = temp;
+
+ temp = last_frame->v_buffer;
+ last_frame->v_buffer = new_frame->v_buffer;
+ new_frame->v_buffer = temp;
+
+}
diff --git a/vp8/common/swapyv12buffer.h b/vp8/common/swapyv12buffer.h
new file mode 100644
index 000000000..caf9499d9
--- /dev/null
+++ b/vp8/common/swapyv12buffer.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef SWAPYV12_BUFFER_H
+#define SWAPYV12_BUFFER_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+
+#endif
diff --git a/vp8/common/systemdependent.h b/vp8/common/systemdependent.h
new file mode 100644
index 000000000..1829b649c
--- /dev/null
+++ b/vp8/common/systemdependent.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp8_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp8_clear_system_state()
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
diff --git a/vp8/common/textblit.c b/vp8/common/textblit.c
new file mode 100644
index 000000000..a45937b12
--- /dev/null
+++ b/vp8/common/textblit.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+
+
/* Render msg onto an 8-bit image plane as white-on-black 5-row x 7-column
 * glyph cells (255 = lit pixel, 0 = background).
 *
 * msg     : NUL-terminated string; 'a'..'z' are folded onto 'A'..'Z',
 *           anything outside ' '..'Z' after folding renders as blank.
 * address : top-left pixel of the first glyph cell.
 * pitch   : bytes per image row.
 *
 * Each glyph is a 35-bit bitmap packed into an int: seven 5-bit columns,
 * column k in bits [5k, 5k+4], bit r of a column lighting row r.
 */
void vp8_blit_text(const char *msg, unsigned char *address, const int pitch)
{
    /* Glyph table for ' '..'Z'.  static const: built once at load time
       instead of being re-materialized on the stack for every call. */
    static const int font[] =
    {
        0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
        0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
        0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
        0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
        0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
        0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
        0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
        0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
        0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
    };
    unsigned char *output_pos = address;
    int colpos = 0;

    while (msg[colpos] != 0)
    {
        char letter = msg[colpos];
        int letter_bitmap;
        int fontcol, fontrow;

        /* Map the character to a glyph; lowercase folds to uppercase,
           unsupported characters fall back to the blank glyph. */
        if (letter <= 'Z' && letter >= ' ')
            letter_bitmap = font[letter-' '];
        else if (letter <= 'z' && letter >= 'a')
            letter_bitmap = font[letter-'a'+'A' - ' '];
        else
            letter_bitmap = font[0];

        /* Unpack the 7x5 bitmap into the image. */
        for (fontcol = 6; fontcol >= 0 ; fontcol--)
            for (fontrow = 0; fontrow < 5; fontrow++)
                output_pos[fontrow *pitch + fontcol] =
                    ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);

        output_pos += 7;   /* advance one glyph cell */
        colpos++;
    }
}
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
new file mode 100644
index 000000000..a02cb244b
--- /dev/null
+++ b/vp8/common/threading.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _PTHREAD_EMULATION
+#define _PTHREAD_EMULATION
+
+#define VPXINFINITE 10000 //10second.
+
+/* Thread management macros */
+#ifdef _WIN32
+/* Win32 */
+#define _WIN32_WINNT 0x500 /* WINBASE.H - Enable signal_object_and_wait */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION DWORD WINAPI
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)
+#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) terminate_thread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+#else
+#ifdef __APPLE__
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Syncrhronization macros: Win32 and Pthreads */
+#ifdef _WIN32
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE))
+#define sem_post(sem) SetEvent(*sem)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
+#else
+
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) // { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
+#else
+#include <unistd.h>
+#define thread_sleep(nms) usleep(nms*1000);// {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
+#endif
+/* Not Windows. Assume pthreads */
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#endif
diff --git a/vp8/common/treecoder.c b/vp8/common/treecoder.c
new file mode 100644
index 000000000..4ad018d49
--- /dev/null
+++ b/vp8/common/treecoder.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
/* Depth-first walk of an encoding tree, filling in the token table.
 *
 * p : token table indexed by symbol value; leaves store their code here.
 * t : the tree (pairs of vp8_tree_index; non-positive entry = leaf whose
 *     symbol is the negated index, positive entry = child node offset).
 * i : index of the current node pair within t.
 * v : bit pattern accumulated on the path to this node.
 * L : number of bits accumulated so far (code length).
 */
static void tree2tok(
    struct vp8_token_struct *const p,
    vp8_tree t,
    int i,
    int v,
    int L
)
{
    /* Descend one level: append a 0 bit (left branch) to the prefix. */
    v += v;
    ++L;

    do
    {
        const vp8_tree_index j = t[i++];

        if (j <= 0)
        {
            /* Leaf: -j is the symbol; record its code and length. */
            p[-j].value = v;
            p[-j].Len = L;
        }
        else
            tree2tok(p, t, j, v, L);
    }
    /* After the left branch, ++v flips the last bit to 1 for the right
       branch; the loop exits once both branches of the pair are done. */
    while (++v & 1);
}
+
/* Construct the encoding (value/length) table for every symbol of tree t,
   starting the recursive walk at the root node with an empty prefix. */
void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
{
    tree2tok(p, t, 0, 0, 0);
}
+
/* Accumulate, for every internal node of the tree, how often each of its
 * two branches is taken when encoding the observed symbol distribution.
 *
 * n          : alphabet size (the tree has n-1 internal nodes).
 * tok        : per-symbol codes/lengths, as built by vp8_tokens_from_tree.
 * tree       : the encoding tree.
 * branch_ct  : output; branch_ct[node][bit] = weighted taken-count.
 * num_events : per-symbol occurrence counts to distribute over the tree.
 */
static void branch_counts(
    int n, /* n = size of alphabet */
    vp8_token tok [ /* n */ ],
    vp8_tree tree,
    unsigned int branch_ct [ /* n-1 */ ] [2],
    const unsigned int num_events[ /* n */ ]
)
{
    const int tree_len = n - 1;
    int t = 0;

#if CONFIG_DEBUG
    assert(tree_len);
#endif

    /* Zero all branch counters. */
    do
    {
        branch_ct[t][0] = branch_ct[t][1] = 0;
    }
    while (++t < tree_len);

    t = 0;

    /* For each symbol, replay its code bit by bit (MSB first) down the
       tree, adding the symbol's event count to every branch it takes. */
    do
    {
        int L = tok[t].Len;
        const int enc = tok[t].value;
        const unsigned int ct = num_events[t];

        vp8_tree_index i = 0;

        do
        {
            const int b = (enc >> --L) & 1;
            const int j = i >> 1;  /* node pairs are at even indices */
#if CONFIG_DEBUG
            assert(j < tree_len && 0 <= L);
#endif

            branch_ct [j] [b] += ct;
            i = tree[ i + b];
        }
        while (i > 0);  /* non-positive index = reached the leaf */

#if CONFIG_DEBUG
        assert(!L);  /* every code bit must have been consumed */
#endif
    }
    while (++t < n);

}
+
+
/* Convert per-symbol occurrence counts into per-node binary probabilities
 * for the encoding tree, also leaving the raw branch counts in branch_ct
 * (useful for deciding whether a probability update is worthwhile).
 *
 * n          : alphabet size.
 * tok, tree  : symbol codes and the tree they come from.
 * probs      : output; probs[node] = probability of taking branch 0,
 *              scaled by Pfac and clamped to [1, 255].
 * branch_ct  : output; filled by branch_counts().
 * num_events : observed symbol counts.
 * Pfac       : probability scale factor (numerator multiplier).
 * rd         : nonzero = round to nearest when dividing, else truncate.
 */
void vp8_tree_probs_from_distribution(
    int n, /* n = size of alphabet */
    vp8_token tok [ /* n */ ],
    vp8_tree tree,
    vp8_prob probs [ /* n-1 */ ],
    unsigned int branch_ct [ /* n-1 */ ] [2],
    const unsigned int num_events[ /* n */ ],
    unsigned int Pfac,
    int rd
)
{
    const int tree_len = n - 1;
    int t = 0;

    branch_counts(n, tok, tree, branch_ct, num_events);

    do
    {
        const unsigned int *const c = branch_ct[t];
        const unsigned int tot = c[0] + c[1];

#if CONFIG_DEBUG
        assert(tot < (1 << 24)); /* no overflow below */
#endif

        if (tot)
        {
            /* p = c[0]/tot scaled by Pfac, optionally rounded; clamp so a
               nonzero count never maps to probability 0 and the result
               fits a vp8_prob. */
            const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
            probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
        }
        else
            probs[t] = vp8_prob_half;  /* no data: assume 50/50 */
    }
    while (++t < tree_len);
}
diff --git a/vp8/common/treecoder.h b/vp8/common/treecoder.h
new file mode 100644
index 000000000..0356d2b02
--- /dev/null
+++ b/vp8/common/treecoder.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp8bc_index_t; // probability index
+
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ( (vp8_prob) 128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+# define vp8_complement( x) (255 - x)
+
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vp8_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
+
+
+typedef const struct vp8_token_struct
+{
+ int value;
+ int Len;
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+ for the associated binary encoding tree. Also writes count of branches
+ taken for each node on the tree; this facilitiates decisions as to
+ probability updates. */
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfactor,
+ int Round
+);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ c_bool_coder_spec *s
+);
+
+
+#endif
diff --git a/vp8/common/type_aliases.h b/vp8/common/type_aliases.h
new file mode 100644
index 000000000..addd26469
--- /dev/null
+++ b/vp8/common/type_aliases.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : type_aliases.h
+*
+* Description : Standard type aliases
+*
+****************************************************************************/
+#ifndef __INC_TYPE_ALIASES_H
+#define __INC_TYPE_ALIASES_H
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define EXPORT
+#define IMPORT extern /* Used to declare imported data & routines */
+#define PRIVATE static /* Used to declare & define module-local data */
+#define LOCAL static /* Used to define all persistent routine-local data */
+#define STD_IN_PATH 0 /* Standard input path */
+#define STD_OUT_PATH 1 /* Standard output path */
+#define STD_ERR_PATH 2 /* Standard error path */
+#define STD_IN_FILE stdin /* Standard input file pointer */
+#define STD_OUT_FILE stdout /* Standard output file pointer */
+#define STD_ERR_FILE stderr /* Standard error file pointer */
+#define max_int 0x7FFFFFFF
+
+#define __export
+#define _export
+
+#define CCONV
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL 0
+#else
+#define NULL ((void *)0)
+#endif
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+/****************************************************************************
+* Typedefs
+****************************************************************************/
+#ifndef TYPE_INT8
+#define TYPE_INT8
+typedef signed char INT8;
+#endif
+
+#ifndef TYPE_INT16
+//#define TYPE_INT16
+typedef signed short INT16;
+#endif
+
+#ifndef TYPE_INT32
+//#define TYPE_INT32
+typedef signed int INT32;
+#endif
+
+#ifndef TYPE_UINT8
+//#define TYPE_UINT8
+typedef unsigned char UINT8;
+#endif
+
+#ifndef TYPE_UINT32
+//#define TYPE_UINT32
+typedef unsigned int UINT32;
+#endif
+
+#ifndef TYPE_UINT16
+//#define TYPE_UINT16
+typedef unsigned short UINT16;
+#endif
+
+#ifndef TYPE_BOOL
+//#define TYPE_BOOL
+typedef int BOOL;
+#endif
+
+typedef unsigned char BOOLEAN;
+
+#ifdef _MSC_VER
+typedef __int64 INT64;
+#else
+
+#ifndef TYPE_INT64
+#ifdef _TMS320C6X
+//for now we only have 40bits
+typedef long INT64;
+#else
+typedef long long INT64;
+#endif
+#endif
+
+#endif
+
+/* Floating point */
+typedef double FLOAT64;
+typedef float FLOAT32;
+
+#endif
diff --git a/vp8/common/vfwsetting.hpp b/vp8/common/vfwsetting.hpp
new file mode 100644
index 000000000..e352e7a19
--- /dev/null
+++ b/vp8/common/vfwsetting.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if !defined(VFWSETTING_HPP)
+#define VFWSETTING_HPP
+//______________________________________________________________________________
+//
+// VFWSetting.hpp
+//
+
+#include "four_cc.hpp"
+#include <iosfwd>
+
+namespace vpxvp
+{
+
+ //--------------------------------------
+ class VFWSetting
+ {
+ friend std::ostream& operator<<(std::ostream& os, const VFWSetting& vfws);
+
+ public:
+
+ enum Mode
+ {
+ m_setting,
+ m_config
+ };
+
+ enum
+ {
+ header_size = 8,
+ Size = 16
+ };
+
+ VFWSetting(four_cc fcc);
+ ~VFWSetting();
+
+ four_cc fcc() const;
+ Mode mode() const;
+
+ int setting() const;
+ int value() const;
+ void setting_value(int i_setting, int i_value); // Sets mode to m_setting
+
+ long size() const;
+ const void* data() const;
+ int data(const void* p_data, unsigned long ul_size);
+
+ private:
+
+ VFWSetting(const VFWSetting& vfws); // Not implemented
+ VFWSetting& operator=(const VFWSetting& vfws); // Not implemented
+
+ int extract_(const void* p_data, unsigned long ul_size);
+ void update_() const;
+
+ four_cc m_fcc;
+ Mode m_mode;
+ int m_i_setting;
+ int m_i_value;
+
+ mutable unsigned char m_p_data[Size];
+ };
+
+} // namespace vpxvp
+
+#endif // VFWSETTING_HPP
diff --git a/vp8/common/vpx_ref_build_prefix.h b/vp8/common/vpx_ref_build_prefix.h
new file mode 100644
index 000000000..40608c6dd
--- /dev/null
+++ b/vp8/common/vpx_ref_build_prefix.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _VPX_REF_BUILD_PREFIX_h
+#define _VPX_REF_BUILD_PREFIX_h
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* include guards */
diff --git a/vp8/common/vpxblit.h b/vp8/common/vpxblit.h
new file mode 100644
index 000000000..d03e0bd02
--- /dev/null
+++ b/vp8/common/vpxblit.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VPXBLIT_H_INCL
+#define VPXBLIT_H_INCL
+/*==============================================================================
+ Includes
+==============================================================================*/
+
+/*==============================================================================
+ Defines
+==============================================================================*/
+
+
+#ifdef VPX_BIG_ENDIAN
+#define BYTE_ZERO(X) ((X & 0xFF000000) >> (24 - 2) )
+#define BYTE_ONE(X) ((X & 0x00FF0000) >> (16 - 2) )
+#define BYTE_TWO(X) ((X & 0x0000FF00) >> (8 - 2) )
+#define BYTE_THREE(X) ((X & 0x000000FF) << (0 + 2) )
+
+#define BYTE_ZERO_UV(X) ((X & 0x0000FF00) >> (8 - 2) )
+#define BYTE_ONE_UV(X) ((X & 0x000000FF) << (0 + 2) )
+
+#define REREFERENCE(X) (*((int *) &(X)))
+
+#else
+
+#define BYTE_THREE(X) ((X & 0xFF000000) >> (24 - 2) )
+#define BYTE_TWO(X) ((X & 0x00FF0000) >> (16 - 2) )
+#define BYTE_ONE(X) ((X & 0x0000FF00) >> (8 - 2) )
+#define BYTE_ZERO(X) ((X & 0x000000FF) << (0 + 2) )
+
+#define BYTE_ONE_UV(X) ((X & 0x0000FF00) >> (8 - 2) )
+#define BYTE_ZERO_UV(X) ((X & 0x000000FF) << (0 + 2) )
+
+#define REREFERENCE(X) (*((int *) &(X)))
+
+#endif
+
+
+/*==============================================================================
+ Type Definitions
+==============================================================================*/
+typedef struct // YUV buffer configuration structure
+{
+ int y_width;
+ int y_height;
+ int y_stride;
+
+ int uv_width;
+ int uv_height;
+ int uv_stride;
+
+ char *y_buffer;
+ char *u_buffer;
+ char *v_buffer;
+
+ char *uv_start;
+ int uv_dst_area;
+ int uv_used_area;
+
+} VPX_BLIT_CONFIG;
+
+typedef struct tx86_params
+{
+ unsigned int pushed_registers[6];
+ unsigned int return_address;
+ unsigned int dst;
+ unsigned int scrn_pitch;
+ VPX_BLIT_CONFIG *buff_config;
+} x86_params;
+
+/*=============================================================================
+ Enums
+==============================================================================*/
+
+
+/*==============================================================================
+ Structures
+==============================================================================*/
+
+/*==============================================================================
+ Constants
+==============================================================================*/
+
+
+/*==============================================================================
+ Variables
+==============================================================================*/
+
+
+
+
+/*==============================================================================
+ Function Prototypes/Macros
+==============================================================================*/
+int vpx_get_size_of_pixel(unsigned int bd);
+void *vpx_get_blitter(unsigned int bd);
+void vpx_set_blit(void);
+void vpx_destroy_blit(void);
+
+
+
+#endif //VPXBLIT_H_INCL
diff --git a/vp8/common/vpxblit_c64.h b/vp8/common/vpxblit_c64.h
new file mode 100644
index 000000000..a8e28f59a
--- /dev/null
+++ b/vp8/common/vpxblit_c64.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _VPX_BLIT_C64_h
+#define _VPX_BLIT_C64_h
+
+/****************************************************************************
+* Typedefs
+****************************************************************************/
+
+typedef struct // YUV buffer configuration structure
+{
+ int y_width;
+ int y_height;
+ int y_stride;
+
+ int uv_width;
+ int uv_height;
+ int uv_stride;
+
+ unsigned char *y_buffer;
+ unsigned char *u_buffer;
+ unsigned char *v_buffer;
+
+ unsigned char *y_ptr_scrn;
+ unsigned char *u_ptr_scrn;
+ unsigned char *v_ptr_scrn;
+
+} DXV_YUV_BUFFER_CONFIG;
+
+typedef struct
+{
+ unsigned char *rgbptr_scrn;
+ unsigned char *y_ptr_scrn;
+ unsigned char *u_ptr_scrn;
+ unsigned char *v_ptr_scrn;
+ unsigned char *rgbptr_scrn2;
+} DXV_FINAL_VIDEO;
+
+#endif /* include guards */
diff --git a/vp8/common/vpxerrors.h b/vp8/common/vpxerrors.h
new file mode 100644
index 000000000..e4c9f3ef3
--- /dev/null
+++ b/vp8/common/vpxerrors.h
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+
+#define ALLOC_FAILURE -2
diff --git a/vp8/common/x86/boolcoder.cxx b/vp8/common/x86/boolcoder.cxx
new file mode 100644
index 000000000..06faca69c
--- /dev/null
+++ b/vp8/common/x86/boolcoder.cxx
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+
+/* Arithmetic bool coder with largish probability range.
+ Timothy S Murphy 6 August 2004 */
+
+#include <assert.h>
+#include <math.h>
+
+#include "bool_coder.h"
+
+#if tim_vp8
+ extern "C" {
+# include "VP8cx/treewriter.h"
+ }
+#endif
+
+int_types::~int_types() {}
+
+void bool_coder_spec::check_prec() const {
+ assert( w && (r==Up || w > 1) && w < 24 && (ebias || w < 17));
+}
+
+bool bool_coder_spec::float_init( uint Ebits, uint Mbits) {
+ uint b = (ebits = Ebits) + (mbits = Mbits);
+ if( b) {
+ assert( ebits < 6 && w + mbits < 31);
+ assert( ebits + mbits < sizeof(Index) * 8);
+ ebias = (1 << ebits) + 1 + mbits;
+ mmask = (1 << mbits) - 1;
+ max_index = ( ( half_index = 1 << b ) << 1) - 1;
+ } else {
+ ebias = 0;
+ max_index = 255;
+ half_index = 128;
+ }
+ check_prec();
+ return b? 1:0;
+}
+
+void bool_coder_spec::cost_init()
+{
+ static cdouble c = -(1 << 20)/log( 2.);
+
+ FILE *f = fopen( "costs.txt", "w");
+ assert( f);
+
+ assert( sizeof(int) >= 4); /* for C interface */
+ assert( max_index <= 255); /* size of Ctbl */
+ uint i = 0; do {
+ cdouble p = ( *this)( (Index) i);
+ Ctbl[i] = (uint32) ( log( p) * c);
+ fprintf(
+ f, "cost( %d -> %10.7f) = %10d = %12.5f bits\n",
+ i, p, Ctbl[i], (double) Ctbl[i] / (1<<20)
+ );
+ } while( ++i <= max_index);
+ fclose( f);
+}
+
+bool_coder_spec_explicit_table::bool_coder_spec_explicit_table(
+ cuint16 tbl[256], Rounding rr, uint prec
+)
+ : bool_coder_spec( prec, rr)
+{
+ check_prec();
+ uint i = 0;
+ if( tbl)
+ do { Ptbl[i] = tbl[i];} while( ++i < 256);
+ else
+ do { Ptbl[i] = i << 8;} while( ++i < 256);
+ cost_init();
+}
+
+
+bool_coder_spec_exponential_table::bool_coder_spec_exponential_table(
+ uint x, Rounding rr, uint prec
+)
+ : bool_coder_spec( prec, rr)
+{
+ assert( x > 1 && x <= 16);
+ check_prec();
+ Ptbl[128] = 32768u;
+ Ptbl[0] = (uint16) pow( 2., 16. - x);
+ --x;
+ int i=1; do {
+ cdouble d = pow( .5, 1. + (1. - i/128.)*x) * 65536.;
+ uint16 v = (uint16) d;
+ if( v < i)
+ v = i;
+ Ptbl[256-i] = (uint16) ( 65536U - (Ptbl[i] = v));
+ } while( ++i < 128);
+ cost_init();
+}
+
+bool_coder_spec::bool_coder_spec( FILE *fp) {
+ fscanf( fp, "%d", &w);
+ int v;
+ fscanf( fp, "%d", &v);
+ assert( 0 <= v && v <= 2);
+ r = (Rounding) v;
+ fscanf( fp, "%d", &ebits);
+ fscanf( fp, "%d", &mbits);
+ if( float_init( ebits, mbits))
+ return;
+ int i=0; do {
+ uint v;
+ fscanf( fp, "%d", &v);
+ assert( 0 <=v && v <= 65535U);
+ Ptbl[i] = v;
+ } while( ++i < 256);
+ cost_init();
+}
+
+void bool_coder_spec::dump( FILE *fp) const {
+ fprintf( fp, "%d %d %d %d\n", w, (int) r, ebits, mbits);
+ if( ebits || mbits)
+ return;
+ int i=0; do { fprintf( fp, "%d\n", Ptbl[i]);} while( ++i < 256);
+}
+
+vp8bc_index_t bool_coder_spec::operator()( double p) const
+{
+ if( p <= 0.)
+ return 0;
+ if( p >= 1.)
+ return max_index;
+ if( ebias) {
+ if( p > .5)
+ return max_index - ( *this)( 1. - p);
+ int e;
+ uint m = (uint) ldexp( frexp( p, &e), mbits + 2);
+ uint x = 1 << (mbits + 1);
+ assert( x <= m && m < x<<1);
+ if( (m = (m >> 1) + (m & 1)) >= x) {
+ m = x >> 1;
+ ++e;
+ }
+ int y = 1 << ebits;
+ if( (e += y) >= y)
+ return half_index - 1;
+ if( e < 0)
+ return 0;
+ return (Index) ( (e << mbits) + (m & mmask));
+ }
+
+ cuint16 v = (uint16) (p * 65536.);
+ int i = 128;
+ int j = 128;
+ uint16 w;
+ while( w = Ptbl[i], j >>= 1) {
+ if( w < v)
+ i += j;
+ else if( w == v)
+ return (uchar) i;
+ else
+ i -= j;
+ }
+ if( w > v) {
+ cuint16 x = Ptbl[i-1];
+ if( v <= x || w - v > v - x)
+ --i;
+ } else if( w < v && i < 255) {
+ cuint16 x = Ptbl[i+1];
+ if( x <= v || x - v < v - w)
+ ++i;
+ }
+ return (Index) i;
+}
+
+double bool_coder_spec::operator()( Index i) const {
+ if( !ebias)
+ return Ptbl[i]/65536.;
+ if( i >= half_index)
+ return 1. - ( *this)( (Index) (max_index - i));
+ return ldexp( (double)mantissa( i), - (int) exponent( i));
+}
+
+
+
+void bool_writer::carry() {
+ uchar *p = B;
+ assert( p > Bstart);
+ while( *--p == 255) { assert( p > Bstart); *p = 0;}
+ ++*p;
+}
+
+
+bool_writer::bool_writer( c_spec& s, uchar *Dest, size_t Len)
+ : bool_coder( s),
+ Bstart( Dest),
+ Bend( Len? Dest+Len : 0),
+ B( Dest)
+{
+ assert( Dest);
+ reset();
+}
+
+bool_writer::~bool_writer() { flush();}
+
+#if 1
+ extern "C" { int bc_v = 0;}
+#else
+# define bc_v 0
+#endif
+
+
+void bool_writer::raw( bool value, uint32 s) {
+ uint32 L = Low;
+
+ assert( Range >= min_range && Range <= spec.max_range());
+ assert( !is_toast && s && s < Range);
+
+ if( bc_v) printf(
+ "Writing a %d, B %x Low %x Range %x s %x blag %d ...\n",
+ value? 1:0, B-Bstart, Low, Range, s, bit_lag
+ );
+ if( value) {
+ L += s;
+ s = Range - s;
+ } else
+ s -= rinc;
+ if( s < min_range) {
+ int ct = bit_lag; do {
+ if( !--ct) {
+ ct = 8;
+ if( L & (1 << 31))
+ carry();
+ assert( !Bend || B < Bend);
+ *B++ = (uchar) (L >> 23);
+ L &= (1<<23) - 1;
+ }
+ } while( L += L, (s += s + rinc) < min_range);
+ bit_lag = ct;
+ }
+ Low = L;
+ Range = s;
+ if( bc_v)
+ printf(
+ "...done, B %x Low %x Range %x blag %d \n",
+ B-Bstart, Low, Range, bit_lag
+ );
+}
+
+bool_writer& bool_writer::flush() {
+ if( is_toast)
+ return *this;
+ int b = bit_lag;
+ uint32 L = Low;
+ assert( b);
+ if( L & (1 << (32 - b)))
+ carry();
+ L <<= b & 7;
+ b >>= 3;
+ while( --b >= 0)
+ L <<= 8;
+ b = 4;
+ assert( !Bend || B + 4 <= Bend);
+ do {
+ *B++ = (uchar) (L >> 24);
+ L <<= 8;
+ } while( --b);
+ is_toast = 1;
+ return *this;
+}
+
+
+bool_reader::bool_reader( c_spec& s, cuchar *src, size_t Len)
+ : bool_coder( s),
+ Bstart( src),
+ B( src),
+ Bend( Len? src+Len : 0),
+ shf( 32 - s.w),
+ bct( 8)
+{
+ int i = 4; do { Low <<= 8; Low |= *B++;} while( --i);
+}
+
+
+bool bool_reader::raw( uint32 s) {
+
+ bool val = 0;
+ uint32 L = Low;
+ cuint32 S = s << shf;
+
+ assert( Range >= min_range && Range <= spec.max_range());
+ assert( s && s < Range && (L >> shf) < Range);
+
+ if( bc_v)
+ printf(
+ "Reading, B %x Low %x Range %x s %x bct %d ...\n",
+ B-Bstart, Low, Range, s, bct
+ );
+
+ if( L >= S) {
+ L -= S;
+ s = Range - s;
+ assert( L < (s << shf));
+ val = 1;
+ } else
+ s -= rinc;
+ if( s < min_range) {
+ int ct = bct;
+ do {
+ assert( ~L & (1 << 31));
+ L += L;
+ if( !--ct) {
+ ct = 8;
+ if( !Bend || B < Bend)
+ L |= *B++;
+ }
+ } while( (s += s + rinc) < min_range);
+ bct = ct;
+ }
+ Low = L;
+ Range = s;
+ if( bc_v)
+ printf(
+ "...done, val %d B %x Low %x Range %x bct %d\n",
+ val? 1:0, B-Bstart, Low, Range, bct
+ );
+ return val;
+}
+
+
+/* C interfaces */
+
+// spec interface
+
+struct NS : bool_coder_namespace {
+ static Rounding r( vp8bc_c_prec *p, Rounding rr =down_full) {
+ return p? (Rounding) p->r : rr;
+ }
+};
+
+bool_coder_spec *vp8bc_vp6spec() {
+ return new bool_coder_spec_explicit_table( 0, bool_coder_namespace::Down, 8);
+}
+bool_coder_spec *vp8bc_float_spec(
+ unsigned int Ebits, unsigned int Mbits, vp8bc_c_prec *p
+) {
+ return new bool_coder_spec_float( Ebits, Mbits, NS::r( p), p? p->prec : 12);
+}
+bool_coder_spec *vp8bc_literal_spec(
+ const unsigned short m[256], vp8bc_c_prec *p
+) {
+ return new bool_coder_spec_explicit_table( m, NS::r( p), p? p->prec : 16);
+}
+bool_coder_spec *vp8bc_exponential_spec( unsigned int x, vp8bc_c_prec *p)
+{
+ return new bool_coder_spec_exponential_table( x, NS::r( p), p? p->prec : 16);
+}
+bool_coder_spec *vp8bc_spec_from_file( FILE *fp) {
+ return new bool_coder_spec( fp);
+}
+void vp8bc_destroy_spec( c_bool_coder_spec *p) { delete p;}
+
+void vp8bc_spec_to_file( c_bool_coder_spec *p, FILE *fp) { p->dump( fp);}
+
+vp8bc_index_t vp8bc_index( c_bool_coder_spec *p, double x) {
+ return ( *p)( x);
+}
+
+vp8bc_index_t vp8bc_index_from_counts(
+ c_bool_coder_spec *p, unsigned int L, unsigned int R
+) {
+ return ( *p)( (R += L)? (double) L/R : .5);
+}
+
+double vp8bc_probability( c_bool_coder_spec *p, vp8bc_index_t i) {
+ return ( *p)( i);
+}
+
+vp8bc_index_t vp8bc_complement( c_bool_coder_spec *p, vp8bc_index_t i) {
+ return p->complement( i);
+}
+unsigned int vp8bc_cost_zero( c_bool_coder_spec *p, vp8bc_index_t i) {
+ return p->cost_zero( i);
+}
+unsigned int vp8bc_cost_one( c_bool_coder_spec *p, vp8bc_index_t i) {
+ return p->cost_one( i);
+}
+unsigned int vp8bc_cost_bit( c_bool_coder_spec *p, vp8bc_index_t i, int v) {
+ return p->cost_bit( i, v);
+}
+
+#if tim_vp8
+ extern "C" int tok_verbose;
+
+# define dbg_l 1000000
+
+ static vp8bc_index_t dbg_i [dbg_l];
+ static char dbg_v [dbg_l];
+ static size_t dbg_w = 0, dbg_r = 0;
+#endif
+
+// writer interface
+
+bool_writer *vp8bc_create_writer(
+ c_bool_coder_spec *p, unsigned char *D, size_t L
+) {
+ return new bool_writer( *p, D, L);
+}
+
+size_t vp8bc_destroy_writer( bool_writer *p) {
+ const size_t s = p->flush().bytes_written();
+ delete p;
+ return s;
+}
+
+void vp8bc_write_bool( bool_writer *p, int v, vp8bc_index_t i)
+{
+# if tim_vp8
+ // bc_v = dbg_w < 10;
+ if( bc_v = tok_verbose)
+ printf( " writing %d at prob %d\n", v? 1:0, i);
+ accum_entropy_bc( &p->Spec(), i, v);
+
+ ( *p)( i, (bool) v);
+
+ if( dbg_w < dbg_l) {
+ dbg_i [dbg_w] = i;
+ dbg_v [dbg_w++] = v? 1:0;
+ }
+# else
+ ( *p)( i, (bool) v);
+# endif
+}
+
+void vp8bc_write_bits( bool_writer *p, unsigned int v, int n)
+{
+# if tim_vp8
+ {
+ c_bool_coder_spec * const s = & p->Spec();
+ const vp8bc_index_t i = s->half_index();
+ int m = n;
+ while( --m >= 0)
+ accum_entropy_bc( s, i, (v>>m) & 1);
+ }
+# endif
+
+ p->write_bits( n, v);
+}
+
+c_bool_coder_spec *vp8bc_writer_spec( c_bool_writer *w) { return & w->Spec();}
+
+// reader interface
+
+bool_reader *vp8bc_create_reader(
+ c_bool_coder_spec *p, const unsigned char *S, size_t L
+) {
+ return new bool_reader( *p, S, L);
+}
+
+void vp8bc_destroy_reader( bool_reader * p) { delete p;}
+
+int vp8bc_read_bool( bool_reader *p, vp8bc_index_t i)
+{
+# if tim_vp8
+ // bc_v = dbg_r < 10;
+ bc_v = tok_verbose;
+ const int v = ( *p)( i)? 1:0;
+ if( tok_verbose)
+ printf( " reading %d at prob %d\n", v, i);
+ if( dbg_r < dbg_l) {
+ assert( dbg_r <= dbg_w);
+ if( i != dbg_i[dbg_r] || v != dbg_v[dbg_r]) {
+ printf(
+ "Position %d: INCORRECTLY READING %d prob %d, wrote %d prob %d\n",
+ dbg_r, v, i, dbg_v[dbg_r], dbg_i[dbg_r]
+ );
+ }
+ ++dbg_r;
+ }
+ return v;
+# else
+ return ( *p)( i)? 1:0;
+# endif
+}
+
+unsigned int vp8bc_read_bits( bool_reader *p, int n) { return p->read_bits( n);}
+
+c_bool_coder_spec *vp8bc_reader_spec( c_bool_reader *r) { return & r->Spec();}
+
+#undef bc_v
diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h
new file mode 100644
index 000000000..5dfb212e1
--- /dev/null
+++ b/vp8/common/x86/idct_x86.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_X86_H
+#define IDCT_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
+extern prototype_idct(vp8_short_idct4x4llm_mmx);
+extern prototype_idct_scalar(vp8_dc_only_idct_mmx);
+
+extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
+extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_idct_idct1
+#define vp8_idct_idct1 vp8_short_idct4x4llm_1_mmx
+
+#undef vp8_idct_idct16
+#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
+
+#undef vp8_idct_idct1_scalar
+#define vp8_idct_idct1_scalar vp8_dc_only_idct_mmx
+
+#undef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
+
+#undef vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx
+
+#endif
+#endif
+
+#if HAVE_SSE2
+
+extern prototype_second_order(vp8_short_inv_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_sse2
+
+#endif
+
+#endif
+
+
+
+#endif
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 000000000..2751c6934
--- /dev/null
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,265 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of a 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, because the 16bit version is 35468, which
+; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative
+; * number.
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
+
+
+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_idct4x4llm_mmx)
+sym(vp8_short_idct4x4llm_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rdx, arg(1) ;output
+
+ movq mm0, [rax ]
+ movq mm1, [rax+ 8]
+
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [x_s1sqr2 GLOBAL] ;
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [x_c1sqr2less1 GLOBAL] ;
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ paddw mm5, mm1
+
+ pmulhw mm3, [x_s1sqr2 GLOBAL]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [x_s1sqr2 GLOBAL] ;
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [x_c1sqr2less1 GLOBAL] ;
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ paddw mm5, mm1
+
+ pmulhw mm3, [x_s1sqr2 GLOBAL]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [fours GLOBAL]
+
+ paddw mm2, [fours GLOBAL]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq [rdx], mm0
+
+ movq [rdx+rax], mm1
+ movq [rdx+rax*2], mm2
+
+ add rdx, rax
+ movq [rdx+rax*2], mm5
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_idct4x4llm_1_mmx)
+sym(vp8_short_idct4x4llm_1_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ movd mm0, [rax]
+
+ paddw mm0, [fours GLOBAL]
+ mov rdx, arg(1) ;output
+
+ psraw mm0, 3
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ punpcklwd mm0, mm0
+ punpckldq mm0, mm0
+
+ movq [rdx], mm0
+ movq [rdx+rax], mm0
+
+ movq [rdx+rax*2], mm0
+ add rdx, rax
+
+ movq [rdx+rax*2], mm0
+
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void dc_only_idct_mmx(short input_dc, short *output, int pitch)
+global sym(vp8_dc_only_idct_mmx)
+sym(vp8_dc_only_idct_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ ; end prolog
+
+ movd mm0, arg(0) ;input_dc
+
+ paddw mm0, [fours GLOBAL]
+ mov rdx, arg(1) ;output
+
+ psraw mm0, 3
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ punpcklwd mm0, mm0
+ punpckldq mm0, mm0
+
+ movq [rdx], mm0
+ movq [rdx+rax], mm0
+
+ movq [rdx+rax*2], mm0
+ add rdx, rax
+
+ movq [rdx+rax*2], mm0
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
new file mode 100644
index 000000000..562e5908f
--- /dev/null
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -0,0 +1,172 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_1_mmx)
+sym(vp8_short_inv_walsh4x4_1_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rax, 3
+
+ mov rdi, arg(1)
+ add rax, [rsi] ;input[0] + 3
+
+ movd mm0, eax
+
+ punpcklwd mm0, mm0 ;x x val val
+
+ punpckldq mm0, mm0 ;val val val val
+
+ psraw mm0, 3 ;(input[0] + 3) >> 3
+
+ movq [rdi + 0], mm0
+ movq [rdi + 8], mm0
+ movq [rdi + 16], mm0
+ movq [rdi + 24], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_mmx)
+sym(vp8_short_inv_walsh4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, 3
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+ shl rax, 16
+
+ movq mm0, [rsi + 0] ;ip[0]
+ movq mm1, [rsi + 8] ;ip[4]
+ or rax, 3 ;00030003h
+
+ movq mm2, [rsi + 16] ;ip[8]
+ movq mm3, [rsi + 24] ;ip[12]
+
+ movd mm7, rax
+ movq mm4, mm0
+
+ punpcklwd mm7, mm7 ;0003000300030003h
+ movq mm5, mm1
+
+ paddw mm4, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm4 ;temp al
+
+ paddw mm4, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+
+ paddw mm0, mm1 ;dl + cl
+ psubw mm5, mm1 ;dl - cl
+
+ ; 03 02 01 00
+ ; 13 12 11 10
+ ; 23 22 21 20
+ ; 33 32 31 30
+
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
+
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm1, mm0
+ movq mm5, mm4
+
+ paddw mm1, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm1 ;temp al
+
+ paddw mm1, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+
+ paddw mm0, mm4 ;dl + cl
+ psubw mm5, mm4 ;dl - cl
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm3, mm1 ; 03 02 01 00
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm4, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm4, mm5 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
+
+ paddw mm0, mm7
+ paddw mm1, mm7
+ paddw mm2, mm7
+ paddw mm3, mm7
+
+ psraw mm0, 3
+ psraw mm1, 3
+ psraw mm2, 3
+ psraw mm3, 3
+
+ movq [rdi + 0], mm0
+ movq [rdi + 8], mm1
+ movq [rdi + 16], mm2
+ movq [rdi + 24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 000000000..96943dfb8
--- /dev/null
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,116 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_sse2)
+sym(vp8_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+ mov rax, 3
+
+ movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
+
+ shl rax, 16
+ or rax, 3 ;00030003h
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+ movd xmm7, eax
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+;;;temp output
+;; movdqu [rdi + 0], xmm4
+;; movdqu [rdi + 16], xmm3
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm7, xmm7, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ paddw xmm5, xmm7
+ paddw xmm1, xmm7
+
+ psraw xmm5, 3
+ psraw xmm1, 3
+
+ movdqa [rdi + 0], xmm5
+ movdqa [rdi + 16], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
new file mode 100644
index 000000000..6e4d2b651
--- /dev/null
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -0,0 +1,1776 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *flimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+; Normal (non-macroblock) loop filter across a horizontal edge. src_ptr points
+; at the first q0 row; rows above (p0..p3) are reached with a negated pitch.
+; Processes 'count' groups of 8 pixels, filtering p1/p0/q0/q1 in place when the
+; mask (built from flimit/limit) allows and applying the extra p1/q1 tap only
+; where high edge variance (thresh) is absent.
+global sym(vp8_loop_filter_horizontal_edge_mmx)
+sym(vp8_loop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+    mov         rsi, arg(0) ;src_ptr
+    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+    movsxd      rcx, dword ptr arg(5) ;count
+next8_h:
+    mov         rdx, arg(3) ;limit
+    movq        mm7, [rdx]
+    mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+    add         rdi, rax
+
+    ; calculate breakout conditions
+    ; mm1 accumulates (saturating) "limit exceeded" bytes across all pair diffs
+    movq        mm2, [rdi+2*rax]      ; q3
+    movq        mm1, [rsi+2*rax]      ; q2
+    movq        mm6, mm1              ; q2
+    psubusb     mm1, mm2              ; q2-=q3
+    psubusb     mm2, mm6              ; q3-=q2
+    por         mm1, mm2              ; abs(q3-q2)
+    psubusb     mm1, mm7              ;
+
+
+    movq        mm4, [rsi+rax]        ; q1
+    movq        mm3, mm4              ; q1
+    psubusb     mm4, mm6              ; q1-=q2
+    psubusb     mm6, mm3              ; q2-=q1
+    por         mm4, mm6              ; abs(q2-q1)
+
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+    movq        mm4, [rsi]            ; q0
+    movq        mm0, mm4              ; q0
+    psubusb     mm4, mm3              ; q0-=q1
+    psubusb     mm3, mm0              ; q1-=q0
+    por         mm4, mm3              ; abs(q0-q1)
+    movq        t0, mm4               ; save to t0
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+
+    neg         rax                   ; negate pitch to deal with above border
+
+    movq        mm2, [rsi+4*rax]      ; p3
+    movq        mm4, [rdi+4*rax]      ; p2
+    movq        mm5, mm4              ; p2
+    psubusb     mm4, mm2              ; p2-=p3
+    psubusb     mm2, mm5              ; p3-=p2
+    por         mm4, mm2              ; abs(p3 - p2)
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+
+    movq        mm4, [rsi+2*rax]      ; p1
+    movq        mm3, mm4              ; p1
+    psubusb     mm4, mm5              ; p1-=p2
+    psubusb     mm5, mm3              ; p2-=p1
+    por         mm4, mm5              ; abs(p2 - p1)
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+    movq        mm2, mm3              ; p1
+
+    movq        mm4, [rsi+rax]        ; p0
+    movq        mm5, mm4              ; p0
+    psubusb     mm4, mm3              ; p0-=p1
+    psubusb     mm3, mm5              ; p1-=p0
+    por         mm4, mm3              ; abs(p1 - p0)
+    movq        t1, mm4               ; save to t1
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+    movq        mm3, [rdi]            ; q1
+    movq        mm4, mm3              ; q1
+    psubusb     mm3, mm2              ; q1-=p1
+    psubusb     mm2, mm4              ; p1-=q1
+    por         mm2, mm3              ; abs(p1-q1)
+    pand        mm2, [tfe GLOBAL]     ; set lsb of each byte to zero
+    psrlw       mm2, 1                ; abs(p1-q1)/2
+
+    movq        mm6, mm5              ; p0
+    movq        mm3, [rsi]            ; q0
+    psubusb     mm5, mm3              ; p0-=q0
+    psubusb     mm3, mm6              ; q0-=p0
+    por         mm5, mm3              ; abs(p0 - q0)
+    paddusb     mm5, mm5              ; abs(p0-q0)*2
+    paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    mov         rdx, arg(2) ;flimit           ; get flimit
+    movq        mm2, [rdx]            ; flimit mm2
+    paddb       mm2, mm2              ; flimit*2 (less than 255)
+    paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
+
+    psubusb     mm5, mm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+    por         mm1, mm5
+    pxor        mm5, mm5
+    pcmpeqb     mm1, mm5              ; mask mm1 (0xff where the filter applies)
+
+    ; calculate high edge variance
+    mov         rdx, arg(4) ;thresh           ; get thresh
+    movq        mm7, [rdx]            ;
+    movq        mm4, t0               ; get abs (q1 - q0)
+    psubusb     mm4, mm7
+    movq        mm3, t1               ; get abs (p1 - p0)
+    psubusb     mm3, mm7
+    paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+    pcmpeqb     mm4, mm5
+
+    pcmpeqb     mm5, mm5
+    pxor        mm4, mm5
+
+
+    ; start work on filters
+    movq        mm2, [rsi+2*rax]      ; p1
+    movq        mm7, [rdi]            ; q1
+    pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
+    pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
+    psubsb      mm2, mm7              ; p1 - q1
+    pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+    pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
+    pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
+    movq        mm3, mm0              ; q0
+    psubsb      mm0, mm6              ; q0 - p0
+    paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+    paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+    paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+    pand        mm1, mm2              ; mask filter values we don't care about
+    movq        mm2, mm1
+    paddsb      mm1, [t4 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+    paddsb      mm2, [t3 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+    ; widen to 16 bits so psraw gives a sign-extended >>3 on each byte
+    pxor        mm0, mm0             ;
+    pxor        mm5, mm5
+    punpcklbw   mm0, mm2            ;
+    punpckhbw   mm5, mm2            ;
+    psraw       mm0, 11             ;
+    psraw       mm5, 11
+    packsswb    mm0, mm5
+    movq        mm2, mm0             ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+    pxor        mm0, mm0              ; 0
+    movq        mm5, mm1              ; abcdefgh
+    punpcklbw   mm0, mm1              ; e0f0g0h0
+    psraw       mm0, 11               ; sign extended shift right by 3
+    pxor        mm1, mm1              ; 0
+    punpckhbw   mm1, mm5              ; a0b0c0d0
+    psraw       mm1, 11               ; sign extended shift right by 3
+    movq        mm5, mm0              ; save results
+
+    packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+    paddsw      mm5, [ones GLOBAL]
+    paddsw      mm1, [ones GLOBAL]
+    psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+    psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+    packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+    pandn       mm4, mm5              ; high edge variance additive
+
+    paddsb      mm6, mm2              ; p0+= p0 add
+    pxor        mm6, [t80 GLOBAL]     ; unoffset
+    movq        [rsi+rax], mm6        ; write back
+
+    movq        mm6, [rsi+2*rax]      ; p1
+    pxor        mm6, [t80 GLOBAL]     ; reoffset
+    paddsb      mm6, mm4              ; p1+= p1 add
+    pxor        mm6, [t80 GLOBAL]     ; unoffset
+    movq        [rsi+2*rax], mm6      ; write back
+
+    psubsb      mm3, mm0              ; q0-= q0 add
+    pxor        mm3, [t80 GLOBAL]     ; unoffset
+    movq        [rsi], mm3            ; write back
+
+    psubsb      mm7, mm4              ; q1-= q1 add
+    pxor        mm7, [t80 GLOBAL]     ; unoffset
+    movq        [rdi], mm7            ; write back
+
+    add         rsi,8
+    neg         rax                   ; restore positive pitch for the next group of 8
+    dec         rcx
+    jnz         next8_h
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_loop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *flimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+; Normal loop filter across a vertical edge. Transposes an 8x8 block so the
+; column edge becomes rows, applies the same p1/p0/q0/q1 filter as the
+; horizontal version (using the srct scratch area to hold the transposed
+; p1..q1 rows), then transposes the four filtered rows back into columns.
+global sym(vp8_loop_filter_vertical_edge_mmx)
+sym(vp8_loop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+    mov         rsi, arg(0) ;src_ptr
+    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+    lea         rsi, [rsi + rax*4 - 4]
+
+    movsxd      rcx, dword ptr arg(5) ;count
+next8_v:
+    mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+    add         rdi, rax
+
+
+    ;transpose
+    movq        mm6, [rsi+2*rax]      ; 67 66 65 64 63 62 61 60
+    movq        mm7, mm6              ; 77 76 75 74 73 72 71 70
+
+    punpckhbw   mm7, [rdi+2*rax]      ; 77 67 76 66 75 65 74 64
+    punpcklbw   mm6, [rdi+2*rax]      ; 73 63 72 62 71 61 70 60
+
+    movq        mm4, [rsi]            ; 47 46 45 44 43 42 41 40
+    movq        mm5, mm4              ; 47 46 45 44 43 42 41 40
+
+    punpckhbw   mm5, [rsi+rax]        ; 57 47 56 46 55 45 54 44
+    punpcklbw   mm4, [rsi+rax]        ; 53 43 52 42 51 41 50 40
+
+    movq        mm3, mm5              ; 57 47 56 46 55 45 54 44
+    punpckhwd   mm5, mm7              ; 77 67 57 47 76 66 56 46
+
+    punpcklwd   mm3, mm7              ; 75 65 55 45 74 64 54 44
+    movq        mm2, mm4              ; 53 43 52 42 51 41 50 40
+
+    punpckhwd   mm4, mm6              ; 73 63 53 43 72 62 52 42
+    punpcklwd   mm2, mm6              ; 71 61 51 41 70 60 50 40
+
+    neg         rax
+    movq        mm6, [rsi+rax*2]      ; 27 26 25 24 23 22 21 20
+
+    movq        mm1, mm6              ; 27 26 25 24 23 22 21 20
+    punpckhbw   mm6, [rsi+rax]        ; 37 27 36 26 35 25 34 24
+
+    punpcklbw   mm1, [rsi+rax]        ; 33 23 32 22 31 21 30 20
+    movq        mm7, [rsi+rax*4];     ; 07 06 05 04 03 02 01 00
+
+    punpckhbw   mm7, [rdi+rax*4]      ; 17 07 16 06 15 05 14 04
+    movq        mm0, mm7              ; 17 07 16 06 15 05 14 04
+
+    punpckhwd   mm7, mm6              ; 37 27 17 07 36 26 16 06
+    punpcklwd   mm0, mm6              ; 35 25 15 05 34 24 14 04
+
+    movq        mm6, mm7              ; 37 27 17 07 36 26 16 06
+    punpckhdq   mm7, mm5              ; 77 67 57 47 37 27 17 07 = q3
+
+    punpckldq   mm6, mm5              ; 76 66 56 46 36 26 16 06 = q2
+
+    movq        mm5, mm6              ; 76 66 56 46 36 26 16 06
+    psubusb     mm5, mm7              ; q2-q3
+
+    psubusb     mm7, mm6              ; q3-q2
+    por         mm7, mm5;             ; mm7=abs (q3-q2)
+
+    movq        mm5, mm0              ; 35 25 15 05 34 24 14 04
+    punpckhdq   mm5, mm3              ; 75 65 55 45 35 25 15 05 = q1
+
+    punpckldq   mm0, mm3              ; 74 64 54 44 34 24 14 04 = q0
+    movq        mm3, mm5              ; 75 65 55 45 35 25 15 05 = q1
+
+    psubusb     mm3, mm6              ; q1-q2
+    psubusb     mm6, mm5              ; q2-q1
+
+    por         mm6, mm3              ; mm6=abs(q2-q1)
+    lea         rdx, srct
+
+    movq        [rdx+24], mm5         ; save q1
+    movq        [rdx+16], mm0         ; save q0
+
+    movq        mm3, [rsi+rax*4]      ; 07 06 05 04 03 02 01 00
+    punpcklbw   mm3, [rdi+rax*4]      ; 13 03 12 02 11 01 10 00
+
+    movq        mm0, mm3              ; 13 03 12 02 11 01 10 00
+    punpcklwd   mm0, mm1              ; 31 21 11 01 30 20 10 00
+
+    punpckhwd   mm3, mm1              ; 33 23 13 03 32 22 12 02
+    movq        mm1, mm0              ; 31 21 11 01 30 20 10 00
+
+    punpckldq   mm0, mm2              ; 70 60 50 40 30 20 10 00  =p3
+    punpckhdq   mm1, mm2              ; 71 61 51 41 31 21 11 01  =p2
+
+    movq        mm2, mm1              ; 71 61 51 41 31 21 11 01  =p2
+    psubusb     mm2, mm0              ; p2-p3
+
+    psubusb     mm0, mm1              ; p3-p2
+    por         mm0, mm2              ; mm0=abs(p3-p2)
+
+    movq        mm2, mm3              ; 33 23 13 03 32 22 12 02
+    punpckldq   mm2, mm4              ; 72 62 52 42 32 22 12 02 = p1
+
+    punpckhdq   mm3, mm4              ; 73 63 53 43 33 23 13 03 = p0
+    movq        [rdx+8], mm3          ; save p0
+
+    movq        [rdx], mm2            ; save p1
+    movq        mm5, mm2              ; mm5 = p1
+
+    psubusb     mm2, mm1              ; p1-p2
+    psubusb     mm1, mm5              ; p2-p1
+
+    por         mm1, mm2              ; mm1=abs(p2-p1)
+    mov         rdx, arg(3) ;limit
+
+    movq        mm4, [rdx]            ; mm4 = limit
+    psubusb     mm7, mm4
+
+    psubusb     mm0, mm4
+    psubusb     mm1, mm4
+
+    psubusb     mm6, mm4
+    por         mm7, mm6
+
+    por         mm0, mm1
+    por         mm0, mm7              ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+    movq        mm1, mm5              ; p1
+
+    movq        mm7, mm3              ; mm3=mm7=p0
+    psubusb     mm7, mm5              ; p0 - p1
+
+    psubusb     mm5, mm3              ; p1 - p0
+    por         mm5, mm7              ; abs(p1-p0)
+
+    movq        t0,  mm5              ; save abs(p1-p0)
+    lea         rdx, srct
+
+    psubusb     mm5, mm4
+    por         mm0, mm5              ; mm0=mask
+
+    movq        mm5, [rdx+16]         ; mm5=q0
+    movq        mm7, [rdx+24]         ; mm7=q1
+
+    movq        mm6, mm5              ; mm6=q0
+    movq        mm2, mm7              ; q1
+    psubusb     mm5, mm7              ; q0-q1
+
+    psubusb     mm7, mm6              ; q1-q0
+    por         mm7, mm5              ; abs(q1-q0)
+
+    movq        t1,  mm7              ; save abs(q1-q0)
+    psubusb     mm7, mm4
+
+    por         mm0, mm7              ; mask
+
+    movq        mm5, mm2              ; q1
+    psubusb     mm5, mm1              ; q1-=p1
+    psubusb     mm1, mm2              ; p1-=q1
+    por         mm5, mm1              ; abs(p1-q1)
+    pand        mm5, [tfe GLOBAL]     ; set lsb of each byte to zero
+    psrlw       mm5, 1                ; abs(p1-q1)/2
+
+    mov         rdx, arg(2) ;flimit   ;
+
+    movq        mm2, [rdx]            ;flimit  mm2
+    movq        mm1, mm3              ; mm1=mm3=p0
+
+    movq        mm7, mm6              ; mm7=mm6=q0
+    psubusb     mm1, mm7              ; p0-q0
+
+    psubusb     mm7, mm3              ; q0-p0
+    por         mm1, mm7              ; abs(q0-p0)
+    paddusb     mm1, mm1              ; abs(q0-p0)*2
+    paddusb     mm1, mm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    paddb       mm2, mm2              ; flimit*2 (less than 255)
+    paddb       mm4, mm2              ; flimit * 2 + limit (less than 255)
+
+    psubusb     mm1, mm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+    por         mm1, mm0;             ; mask
+
+    pxor        mm0, mm0
+    pcmpeqb     mm1, mm0
+
+    ; calculate high edge variance
+    mov         rdx, arg(4) ;thresh           ; get thresh
+    movq        mm7, [rdx]
+    ;
+    movq        mm4, t0               ; get abs (q1 - q0)
+    psubusb     mm4, mm7
+
+    movq        mm3, t1               ; get abs (p1 - p0)
+    psubusb     mm3, mm7
+
+    por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+    pcmpeqb     mm4, mm0
+
+    pcmpeqb     mm0, mm0
+    pxor        mm4, mm0
+
+
+
+    ; start work on filters
+    lea         rdx, srct
+
+    movq        mm2, [rdx]            ; p1
+    movq        mm7, [rdx+24]         ; q1
+
+    movq        mm6, [rdx+8]          ; p0
+    movq        mm0, [rdx+16]         ; q0
+
+    pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
+    pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
+
+    psubsb      mm2, mm7              ; p1 - q1
+    pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+
+    pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
+    pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
+
+    movq        mm3, mm0              ; q0
+    psubsb      mm0, mm6              ; q0 - p0
+
+    paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+    paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+    paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+    pand        mm1, mm2              ; mask filter values we don't care about
+
+    movq        mm2, mm1
+    paddsb      mm1, [t4 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+    paddsb      mm2, [t3 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+    pxor        mm0, mm0             ;
+
+    pxor        mm5, mm5
+    punpcklbw   mm0, mm2            ;
+
+    punpckhbw   mm5, mm2            ;
+    psraw       mm0, 11             ;
+
+    psraw       mm5, 11
+    packsswb    mm0, mm5
+
+    movq        mm2, mm0             ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+    pxor        mm0, mm0              ; 0
+    movq        mm5, mm1              ; abcdefgh
+
+    punpcklbw   mm0, mm1              ; e0f0g0h0
+    psraw       mm0, 11               ; sign extended shift right by 3
+
+    pxor        mm1, mm1              ; 0
+    punpckhbw   mm1, mm5              ; a0b0c0d0
+
+    psraw       mm1, 11               ; sign extended shift right by 3
+    movq        mm5, mm0              ; save results
+
+    packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+    paddsw      mm5, [ones GLOBAL]
+
+    paddsw      mm1, [ones GLOBAL]
+    psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+
+    psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+    packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+    pandn       mm4, mm5              ; high edge variance additive
+
+    paddsb      mm6, mm2              ; p0+= p0 add
+    pxor        mm6, [t80 GLOBAL]     ; unoffset
+
+    ; mm6=p0                               ;
+    movq        mm1, [rdx]            ; p1
+    pxor        mm1, [t80 GLOBAL]     ; reoffset
+
+    paddsb      mm1, mm4              ; p1+= p1 add
+    pxor        mm1, [t80 GLOBAL]     ; unoffset
+    ; mm6 = p0 mm1 = p1
+
+    psubsb      mm3, mm0              ; q0-= q0 add
+    pxor        mm3, [t80 GLOBAL]     ; unoffset
+
+    ; mm3 = q0
+    psubsb      mm7, mm4              ; q1-= q1 add
+    pxor        mm7, [t80 GLOBAL]     ; unoffset
+    ; mm7 = q1
+
+    ; tranpose and write back
+    ; mm1 =    72 62 52 42 32 22 12 02
+    ; mm6 =    73 63 53 43 33 23 13 03
+    ; mm3 =    74 64 54 44 34 24 14 04
+    ; mm7 =    75 65 55 45 35 25 15 05
+
+    movq        mm2, mm1              ; 72 62 52 42 32 22 12 02
+    punpcklbw   mm2, mm6              ; 33 32 23 22 13 12 03 02
+
+    movq        mm4, mm3              ; 74 64 54 44 34 24 14 04
+    punpckhbw   mm1, mm6              ; 73 72 63 62 53 52 43 42
+
+    punpcklbw   mm4, mm7              ; 35 34 25 24 15 14 05 04
+    punpckhbw   mm3, mm7              ; 75 74 65 64 55 54 45 44
+
+    movq        mm6, mm2              ; 33 32 23 22 13 12 03 02
+    punpcklwd   mm2, mm4              ; 15 14 13 12 05 04 03 02
+
+    punpckhwd   mm6, mm4              ; 35 34 33 32 25 24 23 22
+    movq        mm5, mm1              ; 73 72 63 62 53 52 43 42
+
+    punpcklwd   mm1, mm3              ; 55 54 53 52 45 44 43 42
+    punpckhwd   mm5, mm3              ; 75 74 73 72 65 64 63 62
+
+
+    ; mm2 = 15 14 13 12 05 04 03 02
+    ; mm6 = 35 34 33 32 25 24 23 22
+    ; mm5 = 55 54 53 52 45 44 43 42
+    ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+    movd        [rsi+rax*4+2], mm2
+    psrlq       mm2, 32
+
+    movd        [rdi+rax*4+2], mm2
+    movd        [rsi+rax*2+2], mm6
+
+    psrlq       mm6, 32
+    movd        [rsi+rax+2],mm6
+
+    movd        [rsi+2], mm1
+    psrlq       mm1, 32
+
+    movd        [rdi+2], mm1
+    neg         rax
+
+    movd        [rdi+rax+2],mm5
+    psrlq       mm5, 32
+
+    movd        [rdi+rax*2+2], mm5
+
+    lea         rsi, [rsi+rax*8]
+    dec         rcx
+    jnz         next8_v
+
+    add rsp, 64
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *flimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+; Macroblock-edge loop filter across a horizontal edge. Builds the same
+; filter mask as the normal filter, but on the low-variance path it adjusts
+; three pixel pairs (p2..q2) using the 27/18/9 (roughly 3/7, 2/7, 1/7)
+; tap weights, while the high-variance path applies the simple p0/q0 filter.
+global sym(vp8_mbloop_filter_horizontal_edge_mmx)
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32     ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+    mov         rsi, arg(0) ;src_ptr
+    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+    movsxd      rcx, dword ptr arg(5) ;count
+next8_mbh:
+    mov         rdx, arg(3) ;limit
+    movq        mm7, [rdx]
+    mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+    add         rdi, rax
+
+    ; calculate breakout conditions
+    movq        mm2, [rdi+2*rax]      ; q3
+
+    movq        mm1, [rsi+2*rax]      ; q2
+    movq        mm6, mm1              ; q2
+    psubusb     mm1, mm2              ; q2-=q3
+    psubusb     mm2, mm6              ; q3-=q2
+    por         mm1, mm2              ; abs(q3-q2)
+    psubusb     mm1, mm7
+
+
+    ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
+    movq        mm4, [rsi+rax]        ; q1
+    movq        mm3, mm4              ; q1
+    psubusb     mm4, mm6              ; q1-=q2
+    psubusb     mm6, mm3              ; q2-=q1
+    por         mm4, mm6              ; abs(q2-q1)
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+
+    ; mm1 = mask,      mm3=q1, mm7 = limit
+
+    movq        mm4, [rsi]            ; q0
+    movq        mm0, mm4              ; q0
+    psubusb     mm4, mm3              ; q0-=q1
+    psubusb     mm3, mm0              ; q1-=q0
+    por         mm4, mm3              ; abs(q0-q1)
+    movq        t0, mm4               ; save to t0
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+
+    ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+    neg         rax                   ; negate pitch to deal with above border
+
+    movq        mm2, [rsi+4*rax]      ; p3
+    movq        mm4, [rdi+4*rax]      ; p2
+    movq        mm5, mm4              ; p2
+    psubusb     mm4, mm2              ; p2-=p3
+    psubusb     mm2, mm5              ; p3-=p2
+    por         mm4, mm2              ; abs(p3 - p2)
+    psubusb     mm4, mm7
+    por         mm1, mm4
+    ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+    movq        mm4, [rsi+2*rax]      ; p1
+    movq        mm3, mm4              ; p1
+    psubusb     mm4, mm5              ; p1-=p2
+    psubusb     mm5, mm3              ; p2-=p1
+    por         mm4, mm5              ; abs(p2 - p1)
+    psubusb     mm4, mm7
+    por         mm1, mm4
+
+    movq        mm2, mm3              ; p1
+
+
+    ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+    movq        mm4, [rsi+rax]        ; p0
+    movq        mm5, mm4              ; p0
+    psubusb     mm4, mm3              ; p0-=p1
+    psubusb     mm3, mm5              ; p1-=p0
+    por         mm4, mm3              ; abs(p1 - p0)
+    movq        t1, mm4               ; save to t1
+    psubusb     mm4, mm7
+    por         mm1, mm4
+    ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+    ; mm5 = p0
+    movq        mm3, [rdi]            ; q1
+    movq        mm4, mm3              ; q1
+    psubusb     mm3, mm2              ; q1-=p1
+    psubusb     mm2, mm4              ; p1-=q1
+    por         mm2, mm3              ; abs(p1-q1)
+    pand        mm2, [tfe GLOBAL]     ; set lsb of each byte to zero
+    psrlw       mm2, 1                ; abs(p1-q1)/2
+
+    movq        mm6, mm5              ; p0
+    movq        mm3, mm0              ; q0
+    psubusb     mm5, mm3              ; p0-=q0
+    psubusb     mm3, mm6              ; q0-=p0
+    por         mm5, mm3              ; abs(p0 - q0)
+    paddusb     mm5, mm5              ; abs(p0-q0)*2
+    paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    mov         rdx, arg(2) ;flimit           ; get flimit
+    movq        mm2, [rdx]            ; flimit mm2
+    paddb       mm2, mm2              ; flimit*2 (less than 255)
+    paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
+
+    psubusb     mm5, mm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+    por         mm1, mm5
+    pxor        mm5, mm5
+    pcmpeqb     mm1, mm5              ; mask mm1
+
+    ; mm1 = mask, mm0=q0,  mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+    ; mm6 = p0,
+
+    ; calculate high edge variance
+    mov         rdx, arg(4) ;thresh           ; get thresh
+    movq        mm7, [rdx]            ;
+    movq        mm4, t0               ; get abs (q1 - q0)
+    psubusb     mm4, mm7
+    movq        mm3, t1               ; get abs (p1 - p0)
+    psubusb     mm3, mm7
+    paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+    pcmpeqb     mm4, mm5
+
+    pcmpeqb     mm5, mm5
+    pxor        mm4, mm5
+
+
+
+    ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+    ; mm6 = p0, mm4=hev
+    ; start work on filters
+    movq        mm2, [rsi+2*rax]      ; p1
+    movq        mm7, [rdi]            ; q1
+    pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
+    pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
+    psubsb      mm2, mm7              ; p1 - q1
+
+    pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
+    pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
+    movq        mm3, mm0              ; q0
+    psubsb      mm0, mm6              ; q0 - p0
+    paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
+    paddsb      mm2, mm0              ; 2 * (q0 - p0)
+    paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
+    pand        mm1, mm2              ; mask filter values we don't care about
+
+
+    ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+    movq        mm2, mm1              ; vp8_filter
+    pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
+
+    movq        mm5,        mm2       ;
+    paddsb      mm5,        [t3 GLOBAL];
+
+    pxor        mm0, mm0              ; 0
+    pxor        mm7, mm7              ; 0
+
+    punpcklbw   mm0, mm5              ; e0f0g0h0
+    psraw       mm0, 11               ; sign extended shift right by 3
+    punpckhbw   mm7, mm5              ; a0b0c0d0
+    psraw       mm7, 11               ; sign extended shift right by 3
+    packsswb    mm0, mm7              ; Filter2 >>=3;
+
+    movq        mm5, mm0              ; Filter2
+
+    paddsb      mm2, [t4 GLOBAL]      ; vp8_signed_char_clamp(Filter2 + 4)
+    pxor        mm0, mm0              ; 0
+    pxor        mm7, mm7              ; 0
+
+    punpcklbw   mm0, mm2              ; e0f0g0h0
+    psraw       mm0, 11               ; sign extended shift right by 3
+    punpckhbw   mm7, mm2              ; a0b0c0d0
+    psraw       mm7, 11               ; sign extended shift right by 3
+    packsswb    mm0, mm7              ; Filter2 >>=3;
+
+    ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
+    psubsb      mm3, mm0              ; qs0 =qs0 - filter1
+    paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
+
+    ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+    ; vp8_filter &= ~hev;
+    ; Filter2 = vp8_filter;
+    pandn       mm4, mm1              ; vp8_filter&=~hev
+
+
+    ; mm3=qs0, mm4=filter2, mm6=ps0
+
+    ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+    ; s = vp8_signed_char_clamp(qs0 - u);
+    ; *oq0 = s^0x80;
+    ; s = vp8_signed_char_clamp(ps0 + u);
+    ; *op0 = s^0x80;
+    pxor        mm0, mm0
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklbw   mm1, mm4
+    punpckhbw   mm2, mm4
+    pmulhw      mm1, [s27 GLOBAL]
+    pmulhw      mm2, [s27 GLOBAL]
+    paddw       mm1, [s63 GLOBAL]
+    paddw       mm2, [s63 GLOBAL]
+    psraw       mm1, 7
+    psraw       mm2, 7
+    packsswb    mm1, mm2
+
+    psubsb      mm3, mm1
+    paddsb      mm6, mm1
+
+    pxor        mm3, [t80 GLOBAL]
+    pxor        mm6, [t80 GLOBAL]
+    movq        [rsi+rax], mm6
+    movq        [rsi],     mm3
+
+    ; roughly 2/7th difference across boundary
+    ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+    ; s = vp8_signed_char_clamp(qs1 - u);
+    ; *oq1 = s^0x80;
+    ; s = vp8_signed_char_clamp(ps1 + u);
+    ; *op1 = s^0x80;
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklbw   mm1, mm4
+    punpckhbw   mm2, mm4
+    pmulhw      mm1, [s18 GLOBAL]
+    pmulhw      mm2, [s18 GLOBAL]
+    paddw       mm1, [s63 GLOBAL]
+    paddw       mm2, [s63 GLOBAL]
+    psraw       mm1, 7
+    psraw       mm2, 7
+    packsswb    mm1, mm2
+
+    movq        mm3, [rdi]
+    movq        mm6, [rsi+rax*2]       ; p1
+
+    pxor        mm3, [t80 GLOBAL]
+    pxor        mm6, [t80 GLOBAL]
+
+    paddsb      mm6, mm1
+    psubsb      mm3, mm1
+
+    pxor        mm6, [t80 GLOBAL]
+    pxor        mm3, [t80 GLOBAL]
+    movq        [rdi], mm3
+    movq        [rsi+rax*2], mm6
+
+    ; roughly 1/7th difference across boundary
+    ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+    ; s = vp8_signed_char_clamp(qs2 - u);
+    ; *oq2 = s^0x80;
+    ; s = vp8_signed_char_clamp(ps2 + u);
+    ; *op2 = s^0x80;
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklbw   mm1, mm4
+    punpckhbw   mm2, mm4
+    pmulhw      mm1, [s9 GLOBAL]
+    pmulhw      mm2, [s9 GLOBAL]
+    paddw       mm1, [s63 GLOBAL]
+    paddw       mm2, [s63 GLOBAL]
+    psraw       mm1, 7
+    psraw       mm2, 7
+    packsswb    mm1, mm2
+
+
+    movq        mm6, [rdi+rax*4]
+    neg         rax
+    movq        mm3, [rdi+rax  ]
+
+    pxor        mm6, [t80 GLOBAL]
+    pxor        mm3, [t80 GLOBAL]
+
+    paddsb      mm6, mm1
+    psubsb      mm3, mm1
+
+    pxor        mm6, [t80 GLOBAL]
+    pxor        mm3, [t80 GLOBAL]
+    movq        [rdi+rax  ], mm3
+    neg         rax
+    movq        [rdi+rax*4], mm6
+
+;EARLY_BREAK_OUT:
+    neg         rax                   ; restore positive pitch for the next group of 8
+    add         rsi,8
+    dec         rcx
+    jnz         next8_mbh
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_mbloop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *flimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+; Macroblock-edge loop filter across a vertical edge. Transposes an 8x8
+; block into the srct scratch area (p3..q3 at offsets 0..56), applies the
+; same 27/18/9-weighted three-pair filter as the horizontal MB version on
+; the scratch rows, then transposes the result back into columns.
+global sym(vp8_mbloop_filter_vertical_edge_mmx)
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 96      ; reserve 96 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];
+
+    mov         rsi, arg(0) ;src_ptr
+    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+    lea         rsi, [rsi + rax*4 - 4]
+
+    movsxd      rcx, dword ptr arg(5) ;count
+next8_mbv:
+    lea         rdi, [rsi + rax]      ; rdi points to row +1 for indirect addressing
+
+    ;transpose
+    movq        mm0, [rdi+2*rax]      ; 77 76 75 74 73 72 71 70
+    movq        mm6, [rsi+2*rax]      ; 67 66 65 64 63 62 61 60
+
+    movq        mm7, mm6              ; 77 76 75 74 73 72 71 70
+    punpckhbw   mm7, mm0              ; 77 67 76 66 75 65 74 64
+
+    punpcklbw   mm6, mm0              ; 73 63 72 62 71 61 70 60
+    movq        mm0, [rsi+rax]        ; 57 56 55 54 53 52 51 50
+
+    movq        mm4, [rsi]            ; 47 46 45 44 43 42 41 40
+    movq        mm5, mm4              ; 47 46 45 44 43 42 41 40
+
+    punpckhbw   mm5, mm0              ; 57 47 56 46 55 45 54 44
+    punpcklbw   mm4, mm0              ; 53 43 52 42 51 41 50 40
+
+    movq        mm3, mm5              ; 57 47 56 46 55 45 54 44
+    punpckhwd   mm5, mm7              ; 77 67 57 47 76 66 56 46
+
+    punpcklwd   mm3, mm7              ; 75 65 55 45 74 64 54 44
+    movq        mm2, mm4              ; 53 43 52 42 51 41 50 40
+
+    punpckhwd   mm4, mm6              ; 73 63 53 43 72 62 52 42
+    punpcklwd   mm2, mm6              ; 71 61 51 41 70 60 50 40
+
+    neg         rax
+
+    movq        mm7, [rsi+rax]        ; 37 36 35 34 33 32 31 30
+    movq        mm6, [rsi+rax*2]      ; 27 26 25 24 23 22 21 20
+
+    movq        mm1, mm6              ; 27 26 25 24 23 22 21 20
+    punpckhbw   mm6, mm7              ; 37 27 36 26 35 25 34 24
+
+    punpcklbw   mm1, mm7              ; 33 23 32 22 31 21 30 20
+
+    movq        mm7, [rsi+rax*4];     ; 07 06 05 04 03 02 01 00
+    punpckhbw   mm7, [rdi+rax*4]      ; 17 07 16 06 15 05 14 04
+
+    movq        mm0, mm7              ; 17 07 16 06 15 05 14 04
+    punpckhwd   mm7, mm6              ; 37 27 17 07 36 26 16 06
+
+    punpcklwd   mm0, mm6              ; 35 25 15 05 34 24 14 04
+    movq        mm6, mm7              ; 37 27 17 07 36 26 16 06
+
+    punpckhdq   mm7, mm5              ; 77 67 57 47 37 27 17 07  = q3
+    punpckldq   mm6, mm5              ; 76 66 56 46 36 26 16 06  = q2
+
+    lea         rdx, srct
+    movq        mm5, mm6              ; 76 66 56 46 36 26 16 06
+
+    movq        [rdx+56], mm7
+    psubusb     mm5, mm7              ; q2-q3
+
+
+    movq        [rdx+48], mm6
+    psubusb     mm7, mm6              ; q3-q2
+
+    por         mm7, mm5;             ; mm7=abs (q3-q2)
+    movq        mm5, mm0              ; 35 25 15 05 34 24 14 04
+
+    punpckhdq   mm5, mm3              ; 75 65 55 45 35 25 15 05 = q1
+    punpckldq   mm0, mm3              ; 74 64 54 44 34 24 14 04 = q0
+
+    movq        mm3, mm5              ; 75 65 55 45 35 25 15 05 = q1
+    psubusb     mm3, mm6              ; q1-q2
+
+    psubusb     mm6, mm5              ; q2-q1
+    por         mm6, mm3              ; mm6=abs(q2-q1)
+
+    movq        [rdx+40], mm5         ; save q1
+    movq        [rdx+32], mm0         ; save q0
+
+    movq        mm3, [rsi+rax*4]      ; 07 06 05 04 03 02 01 00
+    punpcklbw   mm3, [rdi+rax*4]      ; 13 03 12 02 11 01 10 00
+
+    movq        mm0, mm3              ; 13 03 12 02 11 01 10 00
+    punpcklwd   mm0, mm1              ; 31 21 11 01 30 20 10 00
+
+    punpckhwd   mm3, mm1              ; 33 23 13 03 32 22 12 02
+    movq        mm1, mm0              ; 31 21 11 01 30 20 10 00
+
+    punpckldq   mm0, mm2              ; 70 60 50 40 30 20 10 00  =p3
+    punpckhdq   mm1, mm2              ; 71 61 51 41 31 21 11 01  =p2
+
+    movq        [rdx], mm0            ; save p3
+    movq        [rdx+8], mm1          ; save p2
+
+    movq        mm2, mm1              ; 71 61 51 41 31 21 11 01  =p2
+    psubusb     mm2, mm0              ; p2-p3
+
+    psubusb     mm0, mm1              ; p3-p2
+    por         mm0, mm2              ; mm0=abs(p3-p2)
+
+    movq        mm2, mm3              ; 33 23 13 03 32 22 12 02
+    punpckldq   mm2, mm4              ; 72 62 52 42 32 22 12 02 = p1
+
+    punpckhdq   mm3, mm4              ; 73 63 53 43 33 23 13 03 = p0
+    movq        [rdx+24], mm3         ; save p0
+
+    movq        [rdx+16], mm2         ; save p1
+    movq        mm5, mm2              ; mm5 = p1
+
+    psubusb     mm2, mm1              ; p1-p2
+    psubusb     mm1, mm5              ; p2-p1
+
+    por         mm1, mm2              ; mm1=abs(p2-p1)
+    mov         rdx, arg(3) ;limit
+
+    movq        mm4, [rdx]            ; mm4 = limit
+    psubusb     mm7, mm4              ; abs(q3-q2) > limit
+
+    psubusb     mm0, mm4              ; abs(p3-p2) > limit
+    psubusb     mm1, mm4              ; abs(p2-p1) > limit
+
+    psubusb     mm6, mm4              ; abs(q2-q1) > limit
+    por         mm7, mm6              ; or
+
+    por         mm0, mm1              ;
+    por         mm0, mm7              ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+    movq        mm1, mm5              ; p1
+
+    movq        mm7, mm3              ; mm3=mm7=p0
+    psubusb     mm7, mm5              ; p0 - p1
+
+    psubusb     mm5, mm3              ; p1 - p0
+    por         mm5, mm7              ; abs(p1-p0)
+
+    movq        t0,  mm5              ; save abs(p1-p0)
+    lea         rdx, srct
+
+    psubusb     mm5, mm4              ; mm5 = abs(p1-p0) > limit
+    por         mm0, mm5              ; mm0=mask
+
+    movq        mm5, [rdx+32]         ; mm5=q0
+    movq        mm7, [rdx+40]         ; mm7=q1
+
+    movq        mm6, mm5              ; mm6=q0
+    movq        mm2, mm7              ; q1
+    psubusb     mm5, mm7              ; q0-q1
+
+    psubusb     mm7, mm6              ; q1-q0
+    por         mm7, mm5              ; abs(q1-q0)
+
+    movq        t1,  mm7              ; save abs(q1-q0)
+    psubusb     mm7, mm4              ; mm7=abs(q1-q0)> limit
+
+    por         mm0, mm7              ; mask
+
+    movq        mm5, mm2              ; q1
+    psubusb     mm5, mm1              ; q1-=p1
+    psubusb     mm1, mm2              ; p1-=q1
+    por         mm5, mm1              ; abs(p1-q1)
+    pand        mm5, [tfe GLOBAL]     ; set lsb of each byte to zero
+    psrlw       mm5, 1                ; abs(p1-q1)/2
+
+    mov         rdx, arg(2) ;flimit   ;
+
+    movq        mm2, [rdx]            ;flimit  mm2
+    movq        mm1, mm3              ; mm1=mm3=p0
+
+    movq        mm7, mm6              ; mm7=mm6=q0
+    psubusb     mm1, mm7              ; p0-q0
+
+    psubusb     mm7, mm3              ; q0-p0
+    por         mm1, mm7              ; abs(q0-p0)
+    paddusb     mm1, mm1              ; abs(q0-p0)*2
+    paddusb     mm1, mm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    paddb       mm2, mm2              ; flimit*2 (less than 255)
+    paddb       mm4, mm2              ; flimit * 2 + limit (less than 255)
+
+    psubusb     mm1, mm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+    por         mm1, mm0;             ; mask
+
+    pxor        mm0, mm0
+    pcmpeqb     mm1, mm0
+
+    ; calculate high edge variance
+    mov         rdx, arg(4) ;thresh           ; get thresh
+    movq        mm7, [rdx]
+    ;
+    movq        mm4, t0               ; get abs (q1 - q0)
+    psubusb     mm4, mm7              ; abs(q1 - q0) > thresh
+
+    movq        mm3, t1               ; get abs (p1 - p0)
+    psubusb     mm3, mm7              ; abs(p1 - p0)> thresh
+
+    por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+    pcmpeqb     mm4, mm0
+
+    pcmpeqb     mm0, mm0
+    pxor        mm4, mm0
+
+
+
+
+    ; start work on filters
+    lea         rdx, srct
+
+    ; start work on filters
+    movq        mm2, [rdx+16]         ; p1
+    movq        mm7, [rdx+40]         ; q1
+    pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
+    pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
+    psubsb      mm2, mm7              ; p1 - q1
+
+    movq        mm6, [rdx+24]         ; p0
+    movq        mm0, [rdx+32]         ; q0
+    pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
+    pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
+
+    movq        mm3, mm0              ; q0
+    psubsb      mm0, mm6              ; q0 - p0
+    paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
+    paddsb      mm2, mm0              ; 2 * (q0 - p0)
+    paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
+    pand        mm1, mm2              ; mask filter values we don't care about
+
+    ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+    movq        mm2, mm1              ; vp8_filter
+    pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
+
+    movq        mm5,        mm2       ;
+    paddsb      mm5,        [t3 GLOBAL];
+
+    pxor        mm0, mm0              ; 0
+    pxor        mm7, mm7              ; 0
+
+    punpcklbw   mm0, mm5              ; e0f0g0h0
+    psraw       mm0, 11               ; sign extended shift right by 3
+    punpckhbw   mm7, mm5              ; a0b0c0d0
+    psraw       mm7, 11               ; sign extended shift right by 3
+    packsswb    mm0, mm7              ; Filter2 >>=3;
+
+    movq        mm5, mm0              ; Filter2
+
+    paddsb      mm2, [t4 GLOBAL]      ; vp8_signed_char_clamp(Filter2 + 4)
+    pxor        mm0, mm0              ; 0
+    pxor        mm7, mm7              ; 0
+
+    punpcklbw   mm0, mm2              ; e0f0g0h0
+    psraw       mm0, 11               ; sign extended shift right by 3
+    punpckhbw   mm7, mm2              ; a0b0c0d0
+    psraw       mm7, 11               ; sign extended shift right by 3
+    packsswb    mm0, mm7              ; Filter2 >>=3;
+
+    ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
+    psubsb      mm3, mm0              ; qs0 =qs0 - filter1
+    paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
+
+    ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+    ; vp8_filter &= ~hev;
+    ; Filter2 = vp8_filter;
+    pandn       mm4, mm1              ; vp8_filter&=~hev
+
+
+    ; mm3=qs0, mm4=filter2, mm6=ps0
+
+    ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+    ; s = vp8_signed_char_clamp(qs0 - u);
+    ; *oq0 = s^0x80;
+    ; s = vp8_signed_char_clamp(ps0 + u);
+    ; *op0 = s^0x80;
+    pxor        mm0, mm0
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklbw   mm1, mm4
+    punpckhbw   mm2, mm4
+    pmulhw      mm1, [s27 GLOBAL]
+    pmulhw      mm2, [s27 GLOBAL]
+    paddw       mm1, [s63 GLOBAL]
+    paddw       mm2, [s63 GLOBAL]
+    psraw       mm1, 7
+    psraw       mm2, 7
+    packsswb    mm1, mm2
+
+    psubsb      mm3, mm1
+    paddsb      mm6, mm1
+
+    pxor        mm3, [t80 GLOBAL]
+    pxor        mm6, [t80 GLOBAL]
+    movq        [rdx+24], mm6
+    movq        [rdx+32], mm3
+
+    ; roughly 2/7th difference across boundary
+    ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+    ; s = vp8_signed_char_clamp(qs1 - u);
+    ; *oq1 = s^0x80;
+    ; s = vp8_signed_char_clamp(ps1 + u);
+    ; *op1 = s^0x80;
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklbw   mm1, mm4
+    punpckhbw   mm2, mm4
+    pmulhw      mm1, [s18 GLOBAL]
+    pmulhw      mm2, [s18 GLOBAL]
+    paddw       mm1, [s63 GLOBAL]
+    paddw       mm2, [s63 GLOBAL]
+    psraw       mm1, 7
+    psraw       mm2, 7
+    packsswb    mm1, mm2
+
+    movq        mm3, [rdx + 40]
+    movq        mm6, [rdx + 16]       ; p1
+    pxor        mm3, [t80 GLOBAL]
+    pxor        mm6, [t80 GLOBAL]
+
+    paddsb      mm6, mm1
+    psubsb      mm3, mm1
+
+    pxor        mm6, [t80 GLOBAL]
+    pxor        mm3, [t80 GLOBAL]
+    movq        [rdx + 40], mm3
+    movq        [rdx + 16], mm6
+
+    ; roughly 1/7th difference across boundary
+    ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+    ; s = vp8_signed_char_clamp(qs2 - u);
+    ; *oq2 = s^0x80;
+    ; s = vp8_signed_char_clamp(ps2 + u);
+    ; *op2 = s^0x80;
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklbw   mm1, mm4
+    punpckhbw   mm2, mm4
+    pmulhw      mm1, [s9 GLOBAL]
+    pmulhw      mm2, [s9 GLOBAL]
+    paddw       mm1, [s63 GLOBAL]
+    paddw       mm2, [s63 GLOBAL]
+    psraw       mm1, 7
+    psraw       mm2, 7
+    packsswb    mm1, mm2
+
+    movq        mm6, [rdx+ 8]
+    movq        mm3, [rdx+48]
+
+    pxor        mm6, [t80 GLOBAL]
+    pxor        mm3, [t80 GLOBAL]
+
+    paddsb      mm6, mm1
+    psubsb      mm3, mm1
+
+    pxor        mm6, [t80 GLOBAL]     ; mm6 = 71 61 51 41 31 21 11 01
+    pxor        mm3, [t80 GLOBAL]     ; mm3 = 76 66 56 46 36 26 16 06
+
+    ; tranpose and write back
+    movq        mm0, [rdx]            ; mm0 = 70 60 50 40 30 20 10 00
+    movq        mm1, mm0              ; mm0 = 70 60 50 40 30 20 10 00
+
+    punpcklbw   mm0, mm6              ; mm0 = 31 30 21 20 11 10 01 00
+    punpckhbw   mm1, mm6              ; mm1 = 71 70 61 60 51 50 41 40
+
+    movq        mm2, [rdx+16]         ; mm2 = 72 62 52 42 32 22 12 02
+    movq        mm6, mm2              ; mm6 = 72 62 52 42 32 22 12 02
+
+    punpcklbw   mm2, [rdx+24]         ; mm2 = 33 32 23 22 13 12 03 02
+    punpckhbw   mm6, [rdx+24]         ; mm6 = 73 72 63 62 53 52 43 42
+
+    movq        mm5, mm0              ; mm5 = 31 30 21 20 11 10 01 00
+    punpcklwd   mm0, mm2              ; mm0 = 13 12 11 10 03 02 01 00
+
+    punpckhwd   mm5, mm2              ; mm5 = 33 32 31 30 23 22 21 20
+    movq        mm4, mm1              ; mm4 = 71 70 61 60 51 50 41 40
+
+    punpcklwd   mm1, mm6              ; mm1 = 53 52 51 50 43 42 41 40
+    punpckhwd   mm4, mm6              ; mm4 = 73 72 71 70 63 62 61 60
+
+    movq        mm2, [rdx+32]         ; mm2 = 74 64 54 44 34 24 14 04
+    punpcklbw   mm2, [rdx+40]         ; mm2 = 35 34 25 24 15 14 05 04
+
+    movq        mm6, mm3              ; mm6 = 76 66 56 46 36 26 16 06
+    punpcklbw   mm6, [rdx+56]         ; mm6 = 37 36 27 26 17 16 07 06
+
+    movq        mm7, mm2              ; mm7 = 35 34 25 24 15 14 05 04
+    punpcklwd   mm2, mm6              ; mm2 = 17 16 15 14 07 06 05 04
+
+    punpckhwd   mm7, mm6              ; mm7 = 37 36 35 34 27 26 25 24
+    movq        mm6, mm0              ; mm6 = 13 12 11 10 03 02 01 00
+
+    punpckldq   mm0, mm2              ; mm0 = 07 06 05 04 03 02 01 00
+    punpckhdq   mm6, mm2              ; mm6 = 17 16 15 14 13 12 11 10
+
+    movq        [rsi+rax*4], mm0      ; write out
+    movq        [rdi+rax*4], mm6      ; write out
+
+    movq        mm0, mm5              ; mm0 = 33 32 31 30 23 22 21 20
+    punpckldq   mm0, mm7              ; mm0 = 27 26 25 24 23 22 21 20
+
+    punpckhdq   mm5, mm7              ; mm5 = 37 36 35 34 33 32 31 30
+    movq        [rsi+rax*2], mm0      ; write out
+
+    movq        [rdi+rax*2], mm5      ; write out
+    movq        mm2, [rdx+32]         ; mm2 = 74 64 54 44 34 24 14 04
+
+    punpckhbw   mm2, [rdx+40]         ; mm2 = 75 74 65 64 55 54 45 44
+    punpckhbw   mm3, [rdx+56]         ; mm3 = 77 76 67 66 57 56 47 46
+
+    movq        mm5, mm2              ; mm5 = 75 74 65 64 55 54 45 44
+    punpcklwd   mm2, mm3              ; mm2 = 57 56 55 54 47 46 45 44
+
+    punpckhwd   mm5, mm3              ; mm5 = 77 76 75 74 67 66 65 64
+    movq        mm0, mm1              ; mm0= 53 52 51 50 43 42 41 40
+
+    movq        mm3, mm4              ; mm4 = 73 72 71 70 63 62 61 60
+    punpckldq   mm0, mm2              ; mm0 = 47 46 45 44 43 42 41 40
+
+    punpckhdq   mm1, mm2              ; mm1 = 57 56 55 54 53 52 51 50
+    movq        [rsi], mm0            ; write out
+
+    movq        [rdi], mm1            ; write out
+    neg         rax
+
+    punpckldq   mm3, mm5              ; mm3 = 67 66 65 64 63 62 61 60
+    punpckhdq   mm4, mm5              ; mm4 = 77 76 75 74 73 72 71 70
+
+    movq        [rsi+rax*2], mm3
+    movq        [rdi+rax*2], mm4
+
+    lea         rsi, [rsi+rax*8]
+    dec         rcx
+
+    jnz         next8_mbv
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
+sym(vp8_loop_filter_simple_horizontal_edge_mmx): ; simple 4-tap loop filter across a horizontal edge, 8 pixels per iteration
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    mov rsi, arg(0) ;src_ptr
+    movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+    movsxd rcx, dword ptr arg(5) ;count
+nexts8_h:
+    mov rdx, arg(3) ;limit
+    movq mm7, [rdx]
+    mov rdx, arg(2) ;flimit ; get flimit
+    movq mm3, [rdx] ;
+    paddb mm3, mm3 ; flimit*2 (less than 255)
+    paddb mm3, mm7 ; flimit * 2 + limit (less than 255)
+
+    mov rdi, rsi ; rdi points to row +1 for indirect addressing
+    add rdi, rax
+    neg rax ; rax = -pitch, so [rsi+k*rax] walks rows above the edge
+
+    ; calculate mask
+    movq mm1, [rsi+2*rax] ; p1
+    movq mm0, [rdi] ; q1
+    movq mm2, mm1
+    movq mm7, mm0
+    movq mm4, mm0
+    psubusb mm0, mm1 ; q1-=p1
+    psubusb mm1, mm4 ; p1-=q1
+    por mm1, mm0 ; abs(p1-q1)
+    pand mm1, [tfe GLOBAL] ; set lsb of each byte to zero
+    psrlw mm1, 1 ; abs(p1-q1)/2
+
+    movq mm5, [rsi+rax] ; p0
+    movq mm4, [rsi] ; q0
+    movq mm0, mm4 ; q0
+    movq mm6, mm5 ; p0
+    psubusb mm5, mm4 ; p0-=q0
+    psubusb mm4, mm6 ; q0-=p0
+    por mm5, mm4 ; abs(p0 - q0)
+    paddusb mm5, mm5 ; abs(p0-q0)*2
+    paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+    pxor mm3, mm3
+    pcmpeqb mm5, mm3 ; mm5 = 0xff where the edge should be filtered
+
+    ; start work on filters
+    pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+    pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+    psubsb mm2, mm7 ; p1 - q1
+
+    pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
+    pxor mm0, [t80 GLOBAL] ; offset to convert to signed values
+    movq mm3, mm0 ; q0
+    psubsb mm0, mm6 ; q0 - p0
+    paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
+    paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
+    paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
+    pand mm5, mm2 ; mask filter values we don't care about
+
+    ; do + 4 side
+    paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+    ; emulate a per-byte signed >>3: low bytes via psllw/psraw/psrlw, high bytes via psraw 11/psllw 8
+    movq mm0, mm5 ; get a copy of filters
+    psllw mm0, 8 ; shift left 8
+    psraw mm0, 3 ; arithmetic shift right 3
+    psrlw mm0, 8 ; clear the high byte of each word
+    movq mm1, mm5 ; get a copy of filters
+    psraw mm1, 11 ; arithmetic shift right 11
+    psllw mm1, 8 ; shift left 8 to put it back
+
+    por mm0, mm1 ; put the two together to get result
+
+    psubsb mm3, mm0 ; q0 -= ((filter + 4) >> 3)
+    pxor mm3, [t80 GLOBAL] ; unoffset
+    movq [rsi], mm3 ; write back
+
+
+    ; now do +3 side
+    psubsb mm5, [t1s GLOBAL] ; +3 instead of +4
+
+    movq mm0, mm5 ; get a copy of filters
+    psllw mm0, 8 ; shift left 8
+    psraw mm0, 3 ; arithmetic shift right 3
+    psrlw mm0, 8 ; clear the high byte of each word
+    psraw mm5, 11 ; arithmetic shift right 11
+    psllw mm5, 8 ; shift left 8 to put it back
+    por mm0, mm5 ; put the two together to get result
+
+
+    paddsb mm6, mm0 ; p0 += ((filter + 3) >> 3)
+    pxor mm6, [t80 GLOBAL] ; unoffset
+    movq [rsi+rax], mm6 ; write back
+
+    add rsi,8 ; advance to the next 8-pixel group
+    neg rax ; restore positive pitch for the next iteration
+    dec rcx
+    jnz nexts8_h
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx)
+sym(vp8_loop_filter_simple_vertical_edge_mmx): ; simple 4-tap loop filter across a vertical edge: transpose 8x4, filter, transpose back
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 32 ; reserve 32 bytes
+    %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+    %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+    mov rsi, arg(0) ;src_ptr
+    movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+    lea rsi, [rsi + rax*4- 2]; ; back up 2 columns so p1..q1 straddle the edge
+    movsxd rcx, dword ptr arg(5) ;count
+nexts8_v:
+
+    lea rdi, [rsi + rax];
+    movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
+
+    movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
+    punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+
+    movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
+    movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
+
+    punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+    movq mm5, mm4 ; 53 43 52 42 51 41 50 40
+
+    punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
+    punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
+
+    neg rax ; negate pitch to address the rows above
+
+    movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
+    movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
+
+    punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
+    movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
+
+    movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
+    punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
+
+    movq mm2, mm0 ; 13 03 12 02 11 01 10 00
+    punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
+
+    punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
+    movq mm1, mm0 ; 13 03 12 02 11 01 10 00
+
+    punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
+    movq mm3, mm2 ; 33 23 13 03 32 22 12 02
+
+    punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
+    punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
+
+    punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
+
+
+    ; calculate mask
+    movq mm6, mm0 ; p1
+    movq mm7, mm3 ; q1
+    psubusb mm7, mm6 ; q1-=p1
+    psubusb mm6, mm3 ; p1-=q1
+    por mm6, mm7 ; abs(p1-q1)
+    pand mm6, [tfe GLOBAL] ; set lsb of each byte to zero
+    psrlw mm6, 1 ; abs(p1-q1)/2
+
+    movq mm5, mm1 ; p0
+    movq mm4, mm2 ; q0
+
+    psubusb mm5, mm2 ; p0-=q0
+    psubusb mm4, mm1 ; q0-=p0
+
+    por mm5, mm4 ; abs(p0 - q0)
+    paddusb mm5, mm5 ; abs(p0-q0)*2
+    paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    mov rdx, arg(2) ;flimit ; get flimit
+    movq mm7, [rdx]
+    mov rdx, arg(3) ; get limit
+    movq mm6, [rdx]
+    paddb mm7, mm7 ; flimit*2 (less than 255)
+    paddb mm7, mm6 ; flimit * 2 + limit (less than 255)
+
+    psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+    pxor mm7, mm7
+    pcmpeqb mm5, mm7 ; mm5 = mask
+
+    ; start work on filters
+    movq t0, mm0 ; spill unsigned p1/q1 for the output transpose
+    movq t1, mm3
+
+    pxor mm0, [t80 GLOBAL] ; p1 offset to convert to signed values
+    pxor mm3, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+    psubsb mm0, mm3 ; p1 - q1
+    movq mm6, mm1 ; p0
+
+    movq mm7, mm2 ; q0
+    pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
+
+    pxor mm7, [t80 GLOBAL] ; offset to convert to signed values
+    movq mm3, mm7 ; offseted ; q0
+
+    psubsb mm7, mm6 ; q0 - p0
+    paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
+
+    paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
+    paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
+
+    pand mm5, mm0 ; mask filter values we don't care about
+
+    paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+    ; emulate a per-byte signed >>3: low bytes via psllw/psraw/psrlw, high bytes via psraw 11/psllw 8
+    movq mm0, mm5 ; get a copy of filters
+    psllw mm0, 8 ; shift left 8
+    psraw mm0, 3 ; arithmetic shift right 3
+    psrlw mm0, 8 ; clear the high byte of each word
+
+    movq mm7, mm5 ; get a copy of filters
+    psraw mm7, 11 ; arithmetic shift right 11
+    psllw mm7, 8 ; shift left 8 to put it back
+
+    por mm0, mm7 ; put the two together to get result
+
+    psubsb mm3, mm0 ; q0 -= ((filter + 4) >> 3)
+    pxor mm3, [t80 GLOBAL] ; unoffset
+
+    ; now do +3 side
+    psubsb mm5, [t1s GLOBAL] ; +3 instead of +4
+
+    movq mm0, mm5 ; get a copy of filters
+    psllw mm0, 8 ; shift left 8
+    psraw mm0, 3 ; arithmetic shift right 3
+    psrlw mm0, 8 ; clear the high byte of each word
+
+    psraw mm5, 11 ; arithmetic shift right 11
+    psllw mm5, 8 ; shift left 8 to put it back
+    por mm0, mm5 ; put the two together to get result
+
+    paddsb mm6, mm0 ; p0 += ((filter + 3) >> 3)
+    pxor mm6, [t80 GLOBAL] ; unoffset
+
+
+    movq mm0, t0 ; reload unfiltered p1
+    movq mm4, t1 ; reload unfiltered q1
+
+    ; mm0 = 70 60 50 40 30 20 10 00
+    ; mm6 = 71 61 51 41 31 21 11 01
+    ; mm3 = 72 62 52 42 32 22 12 02
+    ; mm4 = 73 63 53 43 33 23 13 03
+    ; transpose back to write out
+
+    movq mm1, mm0 ;
+    punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
+
+    punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
+    movq mm2, mm3 ;
+
+    punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
+    movq mm5, mm1 ; 71 70 61 60 51 50 41 40
+
+    punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
+    movq mm6, mm0 ; 31 30 21 20 11 10 01 00
+
+    punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
+    punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
+
+    movd [rsi+rax*4], mm0 ; write 03 02 01 00
+    punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
+
+    psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
+    punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
+
+    movd [rdi+rax*4], mm0 ; write 13 12 11 10
+    movd [rsi+rax*2], mm6 ; write 23 22 21 20
+
+    psrlq mm6, 32 ; 33 32 31 30
+    movd [rsi], mm1 ; write 43 42 41 40
+
+    movd [rsi + rax], mm6 ; write 33 32 31 30
+    neg rax ; pitch positive again for the rows below
+
+    movd [rsi + rax*2], mm5 ; write 63 62 61 60
+    psrlq mm1, 32 ; 53 52 51 50
+
+    movd [rdi], mm1 ; write out 53 52 51 50
+    psrlq mm5, 32 ; 73 72 71 70
+
+    movd [rdi + rax*2], mm5 ; write 73 72 71 70
+
+    lea rsi, [rsi+rax*8] ; next 8
+
+    dec rcx
+    jnz nexts8_v
+
+    add rsp, 32
+    pop rsp ; restore pre-ALIGN_STACK stack pointer
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+; int y_stride,
+; loop_filter_info *lfi)
+;{
+;
+;
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+    times 8 db 0xfe ; clears the lsb of each byte so psrlw 1 acts as a per-byte /2
+align 16
+t80:
+    times 8 db 0x80 ; xor toggles bytes between unsigned and signed representations
+align 16
+t1s:
+    times 8 db 0x01 ; 1 per byte: converts the +4 rounding case to +3
+align 16
+t3:
+    times 8 db 0x03 ; +3 filter rounding constant
+align 16
+t4:
+    times 8 db 0x04 ; +4 filter rounding constant
+align 16
+ones:
+    times 4 dw 0x0001 ; word 1: rounding before the extra >>1 (the >>4 tap)
+align 16
+s27:
+    times 4 dw 0x1b00 ; 27 << 8; pmulhw against (byte << 8) yields byte*27
+align 16
+s18:
+    times 4 dw 0x1200 ; 18 << 8; pmulhw against (byte << 8) yields byte*18
+align 16
+s9:
+    times 4 dw 0x0900 ; 9 << 8; pmulhw against (byte << 8) yields byte*9
+align 16
+s63:
+    times 4 dw 0x003f ; +63 rounding bias applied before >>7
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 000000000..5275dfa3b
--- /dev/null
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1978 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2)
+sym(vp8_loop_filter_horizontal_edge_sse2): ; normal loop filter across a horizontal edge, 16 pixels per pass
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 32 ; reserve 32 bytes
+    %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+    %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+    mov rsi, arg(0) ;src_ptr
+    movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+    mov rdx, arg(3) ;limit
+    movdqa xmm7, XMMWORD PTR [rdx]
+    mov rdi, rsi ; rdi points to row +1 for indirect addressing
+    add rdi, rax
+
+    ; calculate breakout conditions
+    movdqu xmm2, [rdi+2*rax] ; q3
+    movdqu xmm1, [rsi+2*rax] ; q2
+    movdqa xmm6, xmm1 ; q2
+    psubusb xmm1, xmm2 ; q2-=q3
+    psubusb xmm2, xmm6 ; q3-=q2
+    por xmm1, xmm2 ; abs(q3-q2)
+    psubusb xmm1, xmm7 ;
+
+
+    movdqu xmm4, [rsi+rax] ; q1
+    movdqa xmm3, xmm4 ; q1
+    psubusb xmm4, xmm6 ; q1-=q2
+    psubusb xmm6, xmm3 ; q2-=q1
+    por xmm4, xmm6 ; abs(q2-q1)
+
+    psubusb xmm4, xmm7
+    por xmm1, xmm4
+
+    movdqu xmm4, [rsi] ; q0
+    movdqa xmm0, xmm4 ; q0
+    psubusb xmm4, xmm3 ; q0-=q1
+    psubusb xmm3, xmm0 ; q1-=q0
+    por xmm4, xmm3 ; abs(q0-q1)
+    movdqa t0, xmm4 ; save to t0
+    psubusb xmm4, xmm7
+    por xmm1, xmm4
+
+    neg rax ; negate pitch to deal with above border
+    movdqu xmm2, [rsi+4*rax] ; p3
+    movdqu xmm4, [rdi+4*rax] ; p2
+    movdqa xmm5, xmm4 ; p2
+    psubusb xmm4, xmm2 ; p2-=p3
+    psubusb xmm2, xmm5 ; p3-=p2
+    por xmm4, xmm2 ; abs(p3 - p2)
+    psubusb xmm4, xmm7
+    por xmm1, xmm4
+
+
+    movdqu xmm4, [rsi+2*rax] ; p1
+    movdqa xmm3, xmm4 ; p1
+    psubusb xmm4, xmm5 ; p1-=p2
+    psubusb xmm5, xmm3 ; p2-=p1
+    por xmm4, xmm5 ; abs(p2 - p1)
+    psubusb xmm4, xmm7
+    por xmm1, xmm4
+
+    movdqa xmm2, xmm3 ; p1
+
+    movdqu xmm4, [rsi+rax] ; p0
+    movdqa xmm5, xmm4 ; p0
+    psubusb xmm4, xmm3 ; p0-=p1
+    psubusb xmm3, xmm5 ; p1-=p0
+    por xmm4, xmm3 ; abs(p1 - p0)
+    movdqa t1, xmm4 ; save to t1
+    psubusb xmm4, xmm7
+    por xmm1, xmm4
+
+    movdqu xmm3, [rdi] ; q1
+    movdqa xmm4, xmm3 ; q1
+    psubusb xmm3, xmm2 ; q1-=p1
+    psubusb xmm2, xmm4 ; p1-=q1
+    por xmm2, xmm3 ; abs(p1-q1)
+    pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
+    psrlw xmm2, 1 ; abs(p1-q1)/2
+
+    movdqa xmm6, xmm5 ; p0
+    movdqu xmm3, [rsi] ; q0
+    psubusb xmm5, xmm3 ; p0-=q0
+    psubusb xmm3, xmm6 ; q0-=p0
+    por xmm5, xmm3 ; abs(p0 - q0)
+    paddusb xmm5, xmm5 ; abs(p0-q0)*2
+    paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    mov rdx, arg(2) ;flimit ; get flimit
+    movdqa xmm2, [rdx] ;
+
+    paddb xmm2, xmm2 ; flimit*2 (less than 255)
+    paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
+
+    psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+    por xmm1, xmm5
+    pxor xmm5, xmm5
+    pcmpeqb xmm1, xmm5 ; mask xmm1: 0xff where all breakout tests passed
+
+
+    ; calculate high edge variance
+    mov rdx, arg(4) ;thresh ; get thresh
+    movdqa xmm7, [rdx] ;
+    movdqa xmm4, t0 ; get abs (q1 - q0)
+    psubusb xmm4, xmm7
+    movdqa xmm3, t1 ; get abs (p1 - p0)
+    psubusb xmm3, xmm7
+    paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+    pcmpeqb xmm4, xmm5
+    pcmpeqb xmm5, xmm5
+    pxor xmm4, xmm5 ; xmm4 = hev mask
+
+
+    ; start work on filters
+    movdqu xmm2, [rsi+2*rax] ; p1
+    movdqu xmm7, [rdi] ; q1
+    pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+    pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+    psubsb xmm2, xmm7 ; p1 - q1
+    pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+    pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+    pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+    movdqa xmm3, xmm0 ; q0
+    psubsb xmm0, xmm6 ; q0 - p0
+    paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+    paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+    paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+    pand xmm1, xmm2 ; mask filter values we don't care about
+    movdqa xmm2, xmm1
+    paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+    paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+    ; signed >>3 per byte: unpack into the high byte of each word, psraw 11, repack
+    pxor xmm0, xmm0 ;
+    pxor xmm5, xmm5
+    punpcklbw xmm0, xmm2 ;
+    punpckhbw xmm5, xmm2 ;
+    psraw xmm0, 11 ;
+    psraw xmm5, 11
+    packsswb xmm0, xmm5
+    movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+    pxor xmm0, xmm0 ; 0
+    movdqa xmm5, xmm1 ; abcdefgh
+    punpcklbw xmm0, xmm1 ; e0f0g0h0
+    psraw xmm0, 11 ; sign extended shift right by 3
+    pxor xmm1, xmm1 ; 0
+    punpckhbw xmm1, xmm5 ; a0b0c0d0
+    psraw xmm1, 11 ; sign extended shift right by 3
+    movdqa xmm5, xmm0 ; save results
+
+    packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+    paddsw xmm5, [ones GLOBAL]
+    paddsw xmm1, [ones GLOBAL]
+    psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+    psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+    packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+    pandn xmm4, xmm5 ; high edge variance additive
+
+    paddsb xmm6, xmm2 ; p0+= p0 add
+    pxor xmm6, [t80 GLOBAL] ; unoffset
+    movdqu [rsi+rax], xmm6 ; write back
+
+    movdqu xmm6, [rsi+2*rax] ; p1
+    pxor xmm6, [t80 GLOBAL] ; reoffset
+    paddsb xmm6, xmm4 ; p1+= p1 add
+    pxor xmm6, [t80 GLOBAL] ; unoffset
+    movdqu [rsi+2*rax], xmm6 ; write back
+
+    psubsb xmm3, xmm0 ; q0-= q0 add
+    pxor xmm3, [t80 GLOBAL] ; unoffset
+    movdqu [rsi], xmm3 ; write back
+
+    psubsb xmm7, xmm4 ; q1-= q1 add
+    pxor xmm7, [t80 GLOBAL] ; unoffset
+    movdqu [rdi], xmm7 ; write back
+
+    add rsp, 32
+    pop rsp ; restore pre-ALIGN_STACK stack pointer
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2)
+sym(vp8_loop_filter_vertical_edge_sse2): ; normal loop filter across a vertical edge: transpose 16 rows, filter, write back transposed
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 96 ; reserve 96 bytes
+    %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+    %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+    %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+    mov rsi, arg(0) ;src_ptr
+    movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+    lea rsi, [rsi + rax*4 - 4]
+    mov rdi, rsi ; rdi points to row +1 for indirect addressing
+
+    add rdi, rax
+    lea rcx, [rdi + rax *8]
+
+    ;transpose
+    movq xmm7, QWORD PTR [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+    movq xmm6, QWORD PTR [rdi+2*rax] ; 77 76 75 74 73 72 71 70
+
+    punpcklbw xmm7, xmm6 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+    movq xmm5, QWORD PTR [rsi] ; 47 46 45 44 43 42 41 40
+
+    movq xmm4, QWORD PTR [rsi+rax] ; 57 56 55 54 53 52 51 50
+    punpcklbw xmm5, xmm4 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+
+    movdqa xmm3, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+    punpckhwd xmm5, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+
+    lea rsi, [rsi+ rax*8]
+
+    punpcklwd xmm3, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+    movq xmm6, QWORD PTR [rsi + 2*rax] ; e7 e6 e5 e4 e3 e2 e1 e0
+
+    movq xmm7, QWORD PTR [rcx + 2*rax] ; f7 f6 f5 f4 f3 f2 f1 f0
+    punpcklbw xmm6, xmm7 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+    movq xmm4, QWORD PTR [rsi] ; c7 c6 c5 c4 c3 c2 c1 c0
+    movq xmm7, QWORD PTR [rsi + rax] ; d7 d6 d5 d4 d3 d2 d1 d0
+
+    punpcklbw xmm4, xmm7 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+    movdqa xmm7, xmm4 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+    punpckhwd xmm7, xmm6 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+    punpcklwd xmm4, xmm6 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+    ; xmm3 xmm4, xmm5 xmm7 in use
+    neg rax
+
+    lea rsi, [rsi+rax*8]
+    movq xmm6, QWORD PTR [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+    movq xmm1, QWORD PTR [rsi+rax ] ; 37 36 35 34 33 32 31 30
+    punpcklbw xmm6, xmm1 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+    movq xmm2, QWORD PTR [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+    movq xmm1, QWORD PTR [rdi+rax*4] ; 17 16 15 14 13 12 11 10
+
+    punpcklbw xmm2, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+    movdqa xmm0, xmm2
+
+    punpckhwd xmm2, xmm6 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+    punpcklwd xmm0, xmm6 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+    movdqa xmm6, xmm2
+    punpckldq xmm2, xmm5 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+    punpckhdq xmm6, xmm5 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+    ;xmm0 xmm2 xmm3 xmm4, xmm6, xmm7
+
+    movdqa xmm5, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+    punpckhdq xmm5, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+    punpckldq xmm0, xmm3 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+    lea rsi, [rcx+rax]
+    ; xmm1, xmm3 free
+    movq xmm1, QWORD PTR [rsi+rax*2] ; a7 a6 a5 a4 a3 a2 a1 a0
+    movq xmm3, QWORD PTR [rsi+rax] ; b7 b6 b5 b4 b3 b2 b1 b0
+
+    punpcklbw xmm1, xmm3 ;
+    lea rdx, srct ;
+
+    movdqa [rdx+16], xmm1 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+    movq xmm3, QWORD PTR [rsi+rax*4] ; 87 86 85 84 83 82 81 80
+
+    movq xmm1, QWORD PTR [rcx+rax*4]
+    punpcklbw xmm3, xmm1 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+    movdqa [rdx], xmm3 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+    punpckhwd xmm3, [rdx+16] ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+    movdqa xmm1, xmm3 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+    punpckhdq xmm1, xmm7 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+    punpckldq xmm3, xmm7 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+    movdqa xmm7, xmm2 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+    punpcklqdq xmm7, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+    punpckhqdq xmm2, xmm3 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+    movdqa [rdx+32], xmm7 ; save 4s
+
+    movdqa [rdx+48], xmm2 ; save 5s
+    movdqa xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+    punpckhqdq xmm7, xmm1 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 = q3
+    punpcklqdq xmm6, xmm1 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 = q2
+
+    ; free 1, 3 xmm7-7s xmm6-6s, xmm2-5s
+    movq xmm1, QWORD PTR [rdx] ; 93 83 92 82 91 81 90 80
+    movq xmm3, QWORD PTR [rdx+16] ; b3 a3 b2 a2 b1 a1 b0 a0
+
+    punpcklwd xmm1, xmm3 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+    movdqa xmm3, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+    punpckhdq xmm3, xmm4 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+    punpckldq xmm1, xmm4 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+    movdqa xmm4, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+    punpcklqdq xmm5, xmm3 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+    punpckhqdq xmm4, xmm3 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+    movdqa [rdx], xmm5 ; save 2s
+
+    movdqa [rdx+16], xmm4 ; save 3s
+
+    movdqa xmm3, xmm6 ;
+    psubusb xmm3, xmm7 ; q3 - q2
+
+    psubusb xmm7, xmm6 ; q2 - q3
+    por xmm7, xmm3 ; abs(q3-q2)
+
+    movdqa xmm3, xmm2 ; q1
+    psubusb xmm3, xmm6 ; q1 - q2
+
+    psubusb xmm6, xmm2 ; q2 - q1
+    por xmm6, xmm3 ; abs(q2-q1)
+
+
+    movdqa xmm3, xmm0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+    punpcklqdq xmm0, xmm1 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+    punpckhqdq xmm3, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+    movdqa xmm1, xmm3
+
+    psubusb xmm3, xmm0 ; p2-p3
+    psubusb xmm0, xmm1 ; p3-p2
+
+    por xmm0, xmm3 ; abs(p3-p2)
+    movdqa xmm3, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+    psubusb xmm3, xmm1 ; p1-p2
+    psubusb xmm1, xmm5 ; p2-p1
+
+    por xmm1, xmm3 ; abs(p1-p2)
+    mov rdx, arg(3) ;limit
+
+    movdqa xmm3, [rdx] ; limit
+
+    psubusb xmm7, xmm3
+    psubusb xmm0, xmm3
+
+    psubusb xmm1, xmm3
+    psubusb xmm6, xmm3
+
+    por xmm7, xmm6
+    por xmm0, xmm1
+
+    por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+    movdqa xmm1, xmm5 ; p1
+
+    movdqa xmm7, xmm4 ; xmm4 xmm7 = p0
+
+    psubusb xmm7, xmm5 ; p0 - p1
+    psubusb xmm5, xmm4 ; p1 - p0
+
+    por xmm5, xmm7 ; abs(p1-p0)
+    movdqa t0, xmm5 ; save abs(p1-p0)
+
+    lea rdx, srct
+    psubusb xmm5, xmm3
+
+    por xmm0, xmm5 ; xmm0=mask
+    movdqa xmm5, [rdx+32] ; xmm5=q0
+
+    movdqa xmm7, [rdx+48] ; xmm7=q1
+    movdqa xmm6, xmm5 ; xmm6 = q0
+
+    movdqa xmm2, xmm7 ; q1
+
+    psubusb xmm5, xmm7 ; q0-q1
+    psubusb xmm7, xmm6 ; q1-q0
+
+    por xmm7, xmm5 ; abs(q1-q0)
+    movdqa t1, xmm7 ; save abs(q1-q0)
+
+    psubusb xmm7, xmm3
+    por xmm0, xmm7 ; mask
+
+    movdqa xmm5, xmm2 ; q1
+    psubusb xmm5, xmm1 ; q1-=p1
+    psubusb xmm1, xmm2 ; p1-=q1
+    por xmm5, xmm1 ; abs(p1-q1)
+    pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
+    psrlw xmm5, 1 ; abs(p1-q1)/2
+
+    mov rdx, arg(2) ;flimit ;
+    movdqa xmm2, [rdx] ;flimit xmm2
+
+    movdqa xmm1, xmm4 ; xmm1=xmm4=p0
+
+    movdqa xmm7, xmm6 ; xmm7=xmm6=q0
+    psubusb xmm1, xmm7 ; p0-q0
+
+    psubusb xmm7, xmm4 ; q0-p0
+    por xmm1, xmm7 ; abs(q0-p0)
+    paddusb xmm1, xmm1 ; abs(q0-p0)*2
+    paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+    paddb xmm2, xmm2 ; flimit*2 (less than 255)
+    paddb xmm3, xmm2 ; flimit * 2 + limit (less than 255)
+
+    psubusb xmm1, xmm3 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+
+    por xmm1, xmm0; ; mask
+
+    pxor xmm0, xmm0
+    pcmpeqb xmm1, xmm0
+    ; calculate high edge variance
+    mov rdx, arg(4) ;thresh ; get thresh
+    movdqa xmm7, [rdx]
+
+    ;
+    movdqa xmm4, t0 ; get abs (q1 - q0)
+    psubusb xmm4, xmm7
+
+    movdqa xmm3, t1 ; get abs (p1 - p0)
+    psubusb xmm3, xmm7
+
+    por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+    pcmpeqb xmm4, xmm0
+
+    pcmpeqb xmm0, xmm0
+    pxor xmm4, xmm0 ; xmm4 = hev mask
+
+    ; start work on filters
+    lea rdx, srct
+
+    movdqa xmm2, [rdx] ; p1
+    movdqa xmm7, [rdx+48] ; q1
+
+    movdqa xmm6, [rdx+16] ; p0
+    movdqa xmm0, [rdx+32] ; q0
+
+    pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+    pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+    psubsb xmm2, xmm7 ; p1 - q1
+    pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+
+    pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+    pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+
+    movdqa xmm3, xmm0 ; q0
+    psubsb xmm0, xmm6 ; q0 - p0
+
+    paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+    paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+    paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+    pand xmm1, xmm2 ; mask filter values we don't care about
+
+    movdqa xmm2, xmm1
+    paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+    paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+    pxor xmm0, xmm0 ;
+
+    pxor xmm5, xmm5
+    punpcklbw xmm0, xmm2 ;
+
+    punpckhbw xmm5, xmm2 ;
+    psraw xmm0, 11 ;
+
+    psraw xmm5, 11
+    packsswb xmm0, xmm5
+
+    movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+    pxor xmm0, xmm0 ; 0
+    movdqa xmm5, xmm1 ; abcdefgh
+
+    punpcklbw xmm0, xmm1 ; e0f0g0h0
+    psraw xmm0, 11 ; sign extended shift right by 3
+
+    pxor xmm1, xmm1 ; 0
+    punpckhbw xmm1, xmm5 ; a0b0c0d0
+
+    psraw xmm1, 11 ; sign extended shift right by 3
+    movdqa xmm5, xmm0 ; save results
+
+    packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+    paddsw xmm5, [ones GLOBAL]
+
+    paddsw xmm1, [ones GLOBAL]
+    psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+
+    psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+    packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+    pandn xmm4, xmm5 ; high edge variance additive
+
+    paddsb xmm6, xmm2 ; p0+= p0 add
+    pxor xmm6, [t80 GLOBAL] ; unoffset
+
+    ; xmm6 = p0 ;
+    movdqa xmm1, [rdx] ; p1
+    pxor xmm1, [t80 GLOBAL] ; reoffset
+
+    paddsb xmm1, xmm4 ; p1+= p1 add
+    pxor xmm1, [t80 GLOBAL] ; unoffset
+    ; xmm6 = p0, xmm1 = p1
+
+    psubsb xmm3, xmm0 ; q0-= q0 add
+    pxor xmm3, [t80 GLOBAL] ; unoffset
+
+    ; xmm3 = q0
+    psubsb xmm7, xmm4 ; q1-= q1 add
+    pxor xmm7, [t80 GLOBAL] ; unoffset
+    ; xmm7 = q1
+
+    ; transpose and write back
+    ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+    ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+    ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+    ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+    movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+    punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+    movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+    punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+    punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+    punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+    movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+    punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+    punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+    movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+    punpcklwd xmm1, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+    punpckhwd xmm5, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+    ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+    ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+    ; xmm5 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+    ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+    lea rsi, [rsi+rax*8]
+
+    movd [rsi+rax*4+2], xmm2
+    psrldq xmm2, 4
+
+    movd [rdi+rax*4+2], xmm2
+    psrldq xmm2, 4
+
+    movd [rsi+rax*2+2], xmm2
+    psrldq xmm2, 4
+
+    movd [rdi+rax*2+2], xmm2
+    movd [rsi+2], xmm6
+
+    psrldq xmm6, 4
+    movd [rdi+2], xmm6
+
+    psrldq xmm6, 4
+    neg rax
+
+    movd [rdi+rax+2], xmm6
+    psrldq xmm6, 4
+
+    movd [rdi+rax*2+2], xmm6
+    lea rsi, [rsi+rax*8]
+
+    neg rax
+    ;;;;;;;;;;;;;;;;;;;;/
+    movd [rsi+rax*4+2], xmm1
+    psrldq xmm1, 4
+
+    movd [rcx+rax*4+2], xmm1
+    psrldq xmm1, 4
+
+    movd [rsi+rax*2+2], xmm1
+    psrldq xmm1, 4
+
+    movd [rcx+rax*2+2], xmm1
+    psrldq xmm1, 4
+
+    movd [rsi+2], xmm5
+    psrldq xmm5, 4
+
+    movd [rcx+2], xmm5
+    psrldq xmm5, 4
+
+    neg rax
+    movd [rcx+rax+2], xmm5
+
+    psrldq xmm5, 4
+    movd [rcx+rax*2+2], xmm5
+
+    add rsp, 96
+    pop rsp ; restore pre-ALIGN_STACK stack pointer
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movdqa xmm2, XMMWORD PTR [rdi+2*rax] ; q3
+ movdqa xmm1, XMMWORD PTR [rsi+2*rax] ; q2
+
+ movdqa xmm6, xmm1 ; q2
+ psubusb xmm1, xmm2 ; q2-=q3
+
+
+ psubusb xmm2, xmm6 ; q3-=q2
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ psubusb xmm1, xmm7
+
+ ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
+ movdqa xmm4, XMMWORD PTR [rsi+rax] ; q1
+ movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ psubusb xmm4, xmm7
+
+ por xmm1, xmm4
+ ; mm1 = mask, mm3=q1, mm7 = limit
+
+ movdqa xmm4, XMMWORD PTR [rsi] ; q0
+ movdqa xmm0, xmm4 ; q0
+
+ psubusb xmm4, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm4, xmm3 ; abs(q0-q1)
+ movdqa t0, xmm4 ; save to t0
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+ neg rax ; negate pitch to deal with above border
+
+ movdqa xmm2, XMMWORD PTR [rsi+4*rax] ; p3
+ movdqa xmm4, XMMWORD PTR [rdi+4*rax] ; p2
+
+ movdqa xmm5, xmm4 ; p2
+ psubusb xmm4, xmm2 ; p2-=p3
+
+ psubusb xmm2, xmm5 ; p3-=p2
+ por xmm4, xmm2 ; abs(p3 - p2)
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+ movdqa xmm4, XMMWORD PTR [rsi+2*rax] ; p1
+ movdqa xmm3, xmm4 ; p1
+
+ psubusb xmm4, xmm5 ; p1-=p2
+ psubusb xmm5, xmm3 ; p2-=p1
+
+ por xmm4, xmm5 ; abs(p2 - p1)
+ psubusb xmm4, xmm7
+
+ por xmm1, xmm4
+
+ movdqa xmm2, xmm3 ; p1
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+ movdqa xmm4, XMMWORD PTR [rsi+rax] ; p0
+ movdqa xmm5, xmm4 ; p0
+
+ psubusb xmm4, xmm3 ; p0-=p1
+ psubusb xmm3, xmm5 ; p1-=p0
+
+ por xmm4, xmm3 ; abs(p1 - p0)
+ movdqa t1, xmm4 ; save to t1
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm5 = p0
+ movdqa xmm3, XMMWORD PTR [rdi] ; q1
+ movdqa xmm4, xmm3 ; q1
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+ por xmm2, xmm3 ; abs(p1-q1)
+ pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ movdqa xmm6, xmm5 ; p0
+ movdqa xmm3, xmm0 ; q0
+
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+
+ por xmm5, xmm3 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;flimit ; get flimit
+ movdqa xmm2, XMMWORD PTR [rdx] ;
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
+ paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm1, xmm5
+ pxor xmm5, xmm5
+ pcmpeqb xmm1, xmm5 ; mask mm1
+ ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0,
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movdqa xmm7, XMMWORD PTR [rdx] ;
+
+ movdqa xmm4, t0 ; get abs (q1 - q0)
+ psubusb xmm4, xmm7
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+ psubusb xmm3, xmm7
+
+ paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb xmm4, xmm5
+
+ pcmpeqb xmm5, xmm5
+ pxor xmm4, xmm5
+ ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0, mm4=hev
+ ; start work on filters
+ movdqa xmm2, XMMWORD PTR [rsi+2*rax] ; p1
+ movdqa xmm7, XMMWORD PTR [rdi] ; q1
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+
+ pand xmm1, xmm2 ; mask filter values we don't care about
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movdqa xmm2, xmm1 ; vp8_filter
+ pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
+
+
+ movdqa xmm5, xmm2 ;
+ paddsb xmm5, [t3 GLOBAL];
+
+ pxor xmm0, xmm0 ; 0
+ pxor xmm7, xmm7 ; 0
+
+ punpcklbw xmm0, xmm5 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm7, xmm5 ; a0b0c0d0
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+ movdqa xmm5, xmm0 ; Filter2
+
+ paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor xmm0, xmm0 ; 0
+
+ pxor xmm7, xmm7 ; 0
+ punpcklbw xmm0, xmm2 ; e0f0g0h0
+
+ psraw xmm0, 11 ; sign extended shift right by 3
+ punpckhbw xmm7, xmm2 ; a0b0c0d0
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+
+ ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
+ psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
+ paddsb xmm6, xmm5 ; ps0 =ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ pxor xmm2, xmm2
+ punpcklbw xmm1, xmm4
+
+ punpckhbw xmm2, xmm4
+ pmulhw xmm1, [s27 GLOBAL]
+
+ pmulhw xmm2, [s27 GLOBAL]
+ paddw xmm1, [s63 GLOBAL]
+
+ paddw xmm2, [s63 GLOBAL]
+ psraw xmm1, 7
+
+ psraw xmm2, 7
+ packsswb xmm1, xmm2
+
+ psubsb xmm3, xmm1
+ paddsb xmm6, xmm1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ movdqa XMMWORD PTR [rsi+rax], xmm6
+ movdqa XMMWORD PTR [rsi], xmm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s18 GLOBAL]
+ pmulhw xmm2, [s18 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm3, XMMWORD PTR [rdi]
+ movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ movdqa XMMWORD PTR [rdi], xmm3
+ movdqa XMMWORD PTR [rsi+rax*2],xmm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s9 GLOBAL]
+ pmulhw xmm2, [s9 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+
+ movdqa xmm6, XMMWORD PTR [rdi+rax*4]
+ neg rax
+
+ movdqa xmm3, XMMWORD PTR [rdi+rax ]
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ movdqa XMMWORD PTR [rdi+rax ], xmm3
+ neg rax
+
+ movdqa XMMWORD PTR [rdi+rax*4], xmm6
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_vertical_edge_sse2)
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 160 ; reserve 160 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
+
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ mov rcx, rax
+ neg rcx
+
+ ; Transpose
+ movq xmm0, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+ movq xmm7, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm7, xmm0 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ movq xmm0, QWORD PTR [rsi+rax] ;
+
+ movq xmm5, QWORD PTR [rsi] ;
+ punpcklbw xmm5, xmm0 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ movq xmm7, QWORD PTR [rsi + rcx] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+
+ movq xmm0, QWORD PTR [rsi + rcx*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm4, QWORD PTR [rsi + rcx*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm7, QWORD PTR [rdi + rcx*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+
+ punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa t0, xmm2 ; save to free XMM2
+ ;movdqa t1, xmm3
+
+ ; XMM3 XMM4 XMM7 in use
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ movq xmm6, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+ movq xmm5, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+ movq xmm6, QWORD PTR [rsi+rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ movq xmm1, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movq xmm5, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+
+ movq xmm0, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm2, QWORD PTR [rsi+rcx*4] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm5, QWORD PTR [rdi+rcx*4] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+
+ punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+
+ lea rdx, srct
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+ movdqa [rdx+112], xmm7 ; save 7
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ movdqa [rdx+96], xmm6 ; save 6
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa [rdx+32], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+48], xmm3 ; save 3
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+64], xmm4 ; save 4
+ movdqa [rdx+80], xmm5 ; save 5
+
+ movdqa xmm1, t0 ; get
+ movdqa xmm2, xmm1 ;
+
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ movdqa [rdx+16], xmm1
+ movdqa [rdx], xmm2
+
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ por xmm7, xmm0 ; abs (q3-q2)
+
+ movdqa xmm1, xmm5 ; q1
+ psubusb xmm1, xmm6 ; q1-q2
+
+ psubusb xmm6, xmm5 ; q2-q1
+ por xmm6, xmm1 ; abs (q2-q1)
+
+ ;/*
+ ;movdqa xmm0, xmm4 ; q0
+ ;psubusb xmm0, xmm5 ; q0-q1
+ ;
+ ;psubusb xmm5, xmm4 ; q1-q0
+ ;por xmm5, xmm0 ; abs (q1-q0)
+ ;*/
+
+ movdqa xmm1, [rdx+16] ; p2
+ movdqa xmm0, xmm1
+
+ psubusb xmm0, xmm2 ; p2 - p3;
+ psubusb xmm2, xmm1 ; p3 - p2;
+
+ por xmm0, xmm2 ; abs(p2-p3)
+
+ movdqa xmm2, [rdx+32] ; p1
+ movdqa xmm5, xmm2 ; p1
+
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movdqa xmm4, [rdx] ; limit
+ psubusb xmm7, xmm4 ;
+
+
+ psubusb xmm0, xmm4 ; abs(p3-p2) > limit
+ psubusb xmm1, xmm4 ; abs(p2-p1) > limit
+
+ psubusb xmm6, xmm4 ; abs(q2-q1) > limit
+ por xmm7, xmm6 ; or
+
+ por xmm0, xmm1 ;
+ por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movdqa xmm1, xmm2 ; p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ movdqa t0, xmm2 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb xmm2, xmm4 ; abs(p1-p0)>limit
+ por xmm0, xmm2 ; mask
+
+ movdqa xmm5, [rdx+64] ; q0
+ movdqa xmm7, [rdx+80] ; q1
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm2, xmm7 ; q1
+ psubusb xmm5, xmm7 ; q0-q1
+
+ psubusb xmm7, xmm6 ; q1-q0
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ movdqa t1, xmm7 ; save abs(q1-q0)
+ psubusb xmm7, xmm4 ; abs(q1-q0)> limit
+
+ por xmm0, xmm7 ; mask
+
+ movdqa xmm5, xmm2 ; q1
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm2 ; p1-=q1
+ por xmm5, xmm1 ; abs(p1-q1)
+ pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;flimit ;
+ movdqa xmm2, [rdx] ; flimit
+
+ movdqa xmm1, xmm3 ; p0
+ movdqa xmm7, xmm6 ; q0
+ psubusb xmm1, xmm7 ; p0-q0
+ psubusb xmm7, xmm3 ; q0-p0
+ por xmm1, xmm7 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
+ paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm1, xmm0; ; mask
+ pxor xmm0, xmm0
+ pcmpeqb xmm1, xmm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movdqa xmm7, [rdx]
+
+ movdqa xmm4, t0 ; get abs (q1 - q0)
+ psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+ psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
+
+ por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb xmm4, xmm0
+
+ pcmpeqb xmm0, xmm0
+ pxor xmm4, xmm0
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ ; start work on filters
+ movdqa xmm2, [rdx+32] ; p1
+ movdqa xmm7, [rdx+80] ; q1
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ movdqa xmm6, [rdx+48] ; p0
+
+ movdqa xmm0, [rdx+64] ; q0
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1)
+
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ ; xmm1 = vp8_filter, xmm4=hev, xmm6=ps0, xmm3=qs0
+ movdqa xmm2, xmm1 ; vp8_filter
+ pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
+
+ movdqa xmm5, xmm2
+ paddsb xmm5, [t3 GLOBAL]
+
+ pxor xmm0, xmm0 ; 0
+ pxor xmm7, xmm7 ; 0
+
+ punpcklbw xmm0, xmm5 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm7, xmm5 ; a0b0c0d0
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+ movdqa xmm5, xmm0 ; Filter2
+
+ paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor xmm0, xmm0 ; 0
+
+ pxor xmm7, xmm7 ; 0
+ punpcklbw xmm0, xmm2 ; e0f0g0h0
+
+ psraw xmm0, 11 ; sign extended shift right by 3
+ punpckhbw xmm7, xmm2 ; a0b0c0d0
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+
+ ; xmm0= filter2 xmm1 = vp8_filter, xmm3 =qs0 xmm5=s xmm4 =hev xmm6=ps0
+ psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
+ paddsb xmm6, xmm5 ; ps0 =ps0 + Filter2
+
+
+ ; xmm1=vp8_filter, xmm3=qs0, xmm4 =hev xmm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+
+ ; xmm3=qs0, xmm4=filter2, xmm6=ps0
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ pxor xmm2, xmm2
+ punpcklbw xmm1, xmm4
+
+ punpckhbw xmm2, xmm4
+ pmulhw xmm1, [s27 GLOBAL]
+
+ pmulhw xmm2, [s27 GLOBAL]
+ paddw xmm1, [s63 GLOBAL]
+
+ paddw xmm2, [s63 GLOBAL]
+ psraw xmm1, 7
+
+ psraw xmm2, 7
+ packsswb xmm1, xmm2
+
+ psubsb xmm3, xmm1
+ paddsb xmm6, xmm1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ movdqa [rdx+48], xmm6
+ movdqa [rdx+64], xmm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s18 GLOBAL]
+ pmulhw xmm2, [s18 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm3, [rdx + 80] ;/q1
+ movdqa xmm6, [rdx + 32] ; p1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ movdqa [rdx + 80], xmm3
+ movdqa [rdx + 32], xmm6
+
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s9 GLOBAL]
+ pmulhw xmm2, [s9 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm6, [rdx+16]
+ movdqa xmm3, [rdx+96]
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+
+ ; transpose and write back
+ movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
+
+ movq QWORD PTR [rsi+rcx*4], xmm0
+ psrldq xmm0, 8
+
+ movq QWORD PTR [rsi+rcx*2], xmm6
+ psrldq xmm6, 8
+
+ movq QWORD PTR [rdi+rcx*4], xmm0
+ movq QWORD PTR [rsi+rcx], xmm6
+
+ movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+
+ punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq QWORD PTR [rsi], xmm0
+ psrldq xmm0, 8
+
+ movq QWORD PTR [rsi+rax*2], xmm5
+ psrldq xmm5, 8
+
+ movq QWORD PTR [rsi+rax], xmm0
+ movq QWORD PTR [rdi+rax*2], xmm5
+
+ movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+ movdqa xmm0, xmm2
+
+ punpcklwd xmm0, xmm3 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+
+ punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ movq QWORD PTR [rsi+rcx*4], xmm1
+ psrldq xmm1, 8
+
+ movq QWORD PTR [rsi+rcx*2], xmm3
+ psrldq xmm3, 8
+
+ movq QWORD PTR [rdi+rcx*4], xmm1
+ movq QWORD PTR [rsi+rcx], xmm3
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpckhdq xmm4, xmm2 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+ movq QWORD PTR [rsi], xmm1
+
+ psrldq xmm1, 8
+
+ movq QWORD PTR [rsi+rax*2], xmm4
+ psrldq xmm4, 8
+
+ movq QWORD PTR [rsi+rax], xmm1
+ movq QWORD PTR [rdi+rax*2], xmm4
+
+ add rsp, 160
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ mov rdx, arg(2) ;flimit ; get flimit
+ movdqa xmm3, XMMWORD PTR [rdx]
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ paddb xmm3, xmm3 ; flimit*2 (less than 255)
+ paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255)
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+ neg rax
+
+ ; calculate mask
+ movdqu xmm1, [rsi+2*rax] ; p1
+ movdqu xmm0, [rdi] ; q1
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqu xmm5, [rsi+rax] ; p0
+ movdqu xmm4, [rsi] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ pxor xmm3, xmm3
+ pcmpeqb xmm5, xmm3
+
+ ; start work on filters
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb xmm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm0, xmm5 ; get a copy of filters
+ psllw xmm0, 8 ; shift left 8
+ psraw xmm0, 3 ; arithmetic shift right 3 (byte is in high half of word)
+ psrlw xmm0, 8
+ movdqa xmm1, xmm5 ; get a copy of filters
+ psraw xmm1, 11 ; arithmetic shift right 11
+ psllw xmm1, 8 ; shift left 8 to put it back
+
+ por xmm0, xmm1 ; put the two together to get result
+
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [t80 GLOBAL] ; unoffset
+ movdqu [rsi], xmm3 ; write back
+
+ ; now do +3 side
+ psubsb xmm5, [t1s GLOBAL] ; +3 instead of +4
+
+ movdqa xmm0, xmm5 ; get a copy of filters
+ psllw xmm0, 8 ; shift left 8
+ psraw xmm0, 3 ; arithmetic shift right 3 (byte is in high half of word)
+ psrlw xmm0, 8
+ psraw xmm5, 11 ; arithmetic shift right 11
+ psllw xmm5, 8 ; shift left 8 to put it back
+ por xmm0, xmm5 ; put the two together to get result
+
+
+ paddsb xmm6, xmm0 ; p0+= p0 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+ movdqu [rsi+rax], xmm6 ; write back
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2)
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi - 2 ]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movdqu xmm2, [rdi] ; 13 12 11 10
+ movdqu xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movdqu xmm4, [rsi + rax*2] ; 23 22 21 20
+ movdqu xmm5, [rdx + rax*2] ; 63 62 61 60
+ movdqu xmm6, [rdi + rax*2] ; 33 32 31 30
+ movdqu xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ movdqa t0, xmm0 ; save to t0
+ movdqa t1, xmm2 ; save to t1
+
+ lea rsi, [rsi + rax*8]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movdqu xmm4, [rsi] ; 83 82 81 80
+ movdqu xmm1, [rdx] ; c3 c2 c1 c0
+ movdqu xmm6, [rdi] ; 93 92 91 90
+ movdqu xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0
+ movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0
+ movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm1, xmm4
+ punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;flimit
+ movdqa xmm7, XMMWORD PTR [rdx]
+ mov rdx, arg(3) ; get limit
+ movdqa xmm6, XMMWORD PTR [rdx]
+ paddb xmm7, xmm7 ; flimit*2 (less than 255)
+ paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; mm5 = mask
+
+ ; start work on filters
+ movdqa t0, xmm0
+ movdqa t1, xmm3
+
+ pxor xmm0, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm3, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm0, xmm3 ; p1 - q1
+ movdqa xmm6, xmm1 ; p0
+
+ movdqa xmm7, xmm2 ; q0
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+
+ pxor xmm7, [t80 GLOBAL] ; offset to convert to signed values
+ movdqa xmm3, xmm7 ; offseted ; q0
+
+ psubsb xmm7, xmm6 ; q0 - p0
+ paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+
+ paddsb xmm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm0, xmm5 ; get a copy of filters
+ psllw xmm0, 8 ; shift left 8
+
+ psraw xmm0, 3 ; arithmetic shift right 3 (byte is in high half of word)
+ psrlw xmm0, 8
+
+ movdqa xmm7, xmm5 ; get a copy of filters
+ psraw xmm7, 11 ; arithmetic shift right 11
+
+ psllw xmm7, 8 ; shift left 8 to put it back
+ por xmm0, xmm7 ; put the two together to get result
+
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+ pxor xmm3, [t80 GLOBAL] ; unoffset q0
+
+ ; now do +3 side
+ psubsb xmm5, [t1s GLOBAL] ; +3 instead of +4
+ movdqa xmm0, xmm5 ; get a copy of filters
+
+ psllw xmm0, 8 ; shift left 8
+ psraw xmm0, 3 ; arithmetic shift right 3 (byte is in high half of word)
+
+ psrlw xmm0, 8
+ psraw xmm5, 11 ; arithmetic shift right 11
+
+ psllw xmm5, 8 ; shift left 8 to put it back
+ por xmm0, xmm5 ; put the two together to get result
+
+ paddsb xmm6, xmm0 ; p0+= p0 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ ; write out order: xmm0 xmm2 xmm1 xmm3
+ lea rdx, [rsi + rax*4]
+
+ movd [rsi], xmm1 ; write the second 8-line result
+ psrldq xmm1, 4
+ movd [rdi], xmm1
+ psrldq xmm1, 4
+ movd [rsi + rax*2], xmm1
+ psrldq xmm1, 4
+ movd [rdi + rax*2], xmm1
+
+ movd [rdx], xmm3
+ psrldq xmm3, 4
+ movd [rcx], xmm3
+ psrldq xmm3, 4
+ movd [rdx + rax*2], xmm3
+ psrldq xmm3, 4
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8]
+ neg rax
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ psrldq xmm0, 4
+ movd [rdi], xmm0
+ psrldq xmm0, 4
+ movd [rsi + rax*2], xmm0
+ psrldq xmm0, 4
+ movd [rdi + rax*2], xmm0
+
+ movd [rdx], xmm2
+ psrldq xmm2, 4
+ movd [rcx], xmm2
+ psrldq xmm2, 4
+ movd [rdx + rax*2], xmm2
+ psrldq xmm2, 4
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1s:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
+align 16
+ones:
+ times 8 dw 0x0001
+align 16
+s27:
+ times 8 dw 0x1b00
+align 16
+s18:
+ times 8 dw 0x1200
+align 16
+s9:
+ times 8 dw 0x0900
+align 16
+s63:
+ times 8 dw 0x003f
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 000000000..143ee7469
--- /dev/null
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "loopfilter.h"
+
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
+prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
+prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
+
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+
+prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
+prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
+prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
+prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
+
+#if HAVE_MMX
+// Horizontal MB filtering
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode horizontal filter for the macroblock's top edge: luma
+     * first, then each chroma plane when present.  simpler_lpf is unused. */
+    (void) simpler_lpf;
+
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+    if (u_ptr != 0)
+    {
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode macroblock horizontal filter: operates on luma only; the
+     * chroma arguments exist solely to match the common dispatch signature. */
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+
+// Vertical MB Filtering
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode vertical filter for the macroblock's left edge: luma
+     * first, then each chroma plane when present.  simpler_lpf is unused. */
+    (void) simpler_lpf;
+
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+    if (u_ptr != 0)
+    {
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode macroblock vertical filter: luma only; the chroma
+     * arguments exist solely to match the common dispatch signature. */
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+
+// Horizontal B Filtering
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode filter for the three inner horizontal luma edges (rows 4,
+     * 8, 12 -- in that order, since the filtered regions overlap), then the
+     * single inner edge of each chroma plane when present. */
+    int row;
+
+    (void) simpler_lpf;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+
+    if (u_ptr != 0)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode filter for the three inner horizontal luma edges (rows 4,
+     * 8, 12, in order).  Luma only; chroma arguments are ignored. */
+    int row;
+
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+}
+
+
+// Vertical B Filtering
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode filter for the three inner vertical luma edges (columns
+     * 4, 8, 12 -- in that order, since the filtered regions overlap), then
+     * the single inner edge of each chroma plane when present. */
+    int col;
+
+    (void) simpler_lpf;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_vertical_edge_mmx(y_ptr + col, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+
+    if (u_ptr != 0)
+    {
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode filter for the three inner vertical luma edges (columns
+     * 4, 8, 12, in order).  Luma only; chroma arguments are ignored. */
+    int col;
+
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + col, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+}
+#endif
+
+
+// Horizontal MB filtering
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode macroblock horizontal filter: luma via the SSE2 kernel;
+     * U and V still go through the MMX routines. */
+    (void) simpler_lpf;
+
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+    if (u_ptr != 0)
+    {
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode macroblock horizontal filter (SSE2): luma only; the
+     * chroma arguments exist solely to match the dispatch signature. */
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+
+// Vertical MB Filtering
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode macroblock vertical filter: luma via the SSE2 kernel;
+     * U and V still go through the MMX routines. */
+    (void) simpler_lpf;
+
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+
+    if (u_ptr != 0)
+    {
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode macroblock vertical filter (SSE2): luma only; the chroma
+     * arguments exist solely to match the dispatch signature. */
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+}
+
+
+// Horizontal B Filtering
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode filter for the three inner horizontal luma edges (rows 4,
+     * 8, 12, in order) via SSE2; the chroma inner edges still use MMX. */
+    int row;
+
+    (void) simpler_lpf;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+
+    if (u_ptr != 0)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode filter for the three inner horizontal luma edges (rows 4,
+     * 8, 12, in order).  Luma only; chroma arguments are ignored. */
+    int row;
+
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+}
+
+
+// Vertical B Filtering
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Normal-mode filter for the three inner vertical luma edges (columns
+     * 4, 8, 12, in order) via SSE2; the chroma inner edges still use MMX. */
+    int col;
+
+    (void) simpler_lpf;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_vertical_edge_sse2(y_ptr + col, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+
+    if (u_ptr != 0)
+    {
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+
+    if (v_ptr != 0)
+    {
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+    }
+}
+
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    /* Simple-mode filter for the three inner vertical luma edges (columns
+     * 4, 8, 12, in order).  Luma only; chroma arguments are ignored. */
+    int col;
+
+    (void) simpler_lpf;
+    (void) uv_stride;
+    (void) v_ptr;
+    (void) u_ptr;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + col, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    }
+}
+
+#endif
+
+#if 0
+/* Dead code: compiled out.  NOTE(review): the name ends in "_sse" while the
+ * prototype declared at the top of this file is
+ * vp8_fast_loop_filter_vertical_edges_sse2, so this would not link against
+ * that declaration if re-enabled -- reconcile the names or remove both. */
+void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr,
+ int y_stride,
+ loop_filter_info *lfi)
+{
+
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+}
+#endif
diff --git a/vp8/common/x86/loopfilter_x86.h b/vp8/common/x86/loopfilter_x86.h
new file mode 100644
index 000000000..c87f38a31
--- /dev/null
+++ b/vp8/common/x86/loopfilter_x86.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef LOOPFILTER_X86_H
+#define LOOPFILTER_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx);
+extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_lf_normal_mb_v
+#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_mmx
+
+#undef vp8_lf_normal_b_v
+#define vp8_lf_normal_b_v vp8_loop_filter_bv_mmx
+
+#undef vp8_lf_normal_mb_h
+#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_mmx
+
+#undef vp8_lf_normal_b_h
+#define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx
+
+#undef vp8_lf_simple_mb_v
+#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx
+
+#undef vp8_lf_simple_b_v
+#define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx
+
+#undef vp8_lf_simple_mb_h
+#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx
+
+#undef vp8_lf_simple_b_h
+#define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2);
+extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_lf_normal_mb_v
+#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_sse2
+
+#undef vp8_lf_normal_b_v
+#define vp8_lf_normal_b_v vp8_loop_filter_bv_sse2
+
+#undef vp8_lf_normal_mb_h
+#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_sse2
+
+#undef vp8_lf_normal_b_h
+#define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2
+
+#undef vp8_lf_simple_mb_v
+#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2
+
+#undef vp8_lf_simple_b_v
+#define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2
+
+#undef vp8_lf_simple_mb_h
+#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2
+
+#undef vp8_lf_simple_b_h
+#define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2
+#endif
+#endif
+
+
+#endif
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
new file mode 100644
index 000000000..721c8d612
--- /dev/null
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -0,0 +1,533 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+;void vp8_post_proc_down_and_across_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int rows,
+; int cols,
+; int flimit
+;)
+; Post-processing blur: each pixel gets a 5-tap (16,16,64,16,16)/128 filter,
+; first down the columns (src -> dst) and then across the rows (in place in
+; dst), but the blurred value is kept only where every tap differs from the
+; centre pixel by no more than flimit; otherwise the source pixel is kept.
+global sym(vp8_post_proc_down_and_across_mmx)
+sym(vp8_post_proc_down_and_across_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ ; move the global rd onto the stack, since we don't have enough registers
+ ; to do PIC addressing
+ movq mm0, [rd GLOBAL]
+ sub rsp, 8
+ movq [rsp], mm0
+%define RD [rsp]
+ ; NOTE(review): the "push rbx" / "push rax" below move rsp, so by the time
+ ; RD is read at the two paddusw sites [rsp] no longer addresses this slot;
+ ; the 32-bit PIC path looks broken -- verify, and consider addressing the
+ ; slot relative to rbp instead.
+%else
+%define RD [rd GLOBAL]
+%endif
+
+ push rbx
+ lea rbx, [Blur GLOBAL]
+ movd mm2, dword ptr arg(6) ;flimit
+ punpcklwd mm2, mm2
+ punpckldq mm2, mm2 ; mm2 = flimit in all four words
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rcx, DWORD PTR arg(4) ;rows
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; source pitch
+ pxor mm0, mm0 ; mm0 = 00000000
+
+nextrow:
+
+ xor rdx, rdx ; clear out rdx for use as loop counter
+nextcol:
+ ; vertical pass: 5-tap blur of 4 pixels in the current column group
+ pxor mm7, mm7 ; mm7 = 00000000
+ movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
+ movq mm3, [rsi] ; mm3 = r0 p0..p7
+ punpcklbw mm3, mm0 ; mm3 = p0..p3
+ movq mm1, mm3 ; mm1 = p0..p3
+ pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
+
+ movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
+ movq mm5, [rsi + rax] ; mm5 = r1 p0..p7
+ punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
+ pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm7, mm1 ; mm7 = r0 p0..p3
+ psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
+ psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
+ paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+ pcmpgtw mm7, mm2
+
+ movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
+ movq mm5, [rsi + 2*rax] ; mm5 = r2 p0..p7
+ punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
+ pmullw mm6, mm5 ; mm6 *= kernel 4 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = r0 p0..p3
+ psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
+ psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+
+ neg rax
+ movq mm6, [rbx ] ; kernel 0 taps
+ movq mm5, [rsi+2*rax] ; mm5 = r-2 p0..p7
+ punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
+ pmullw mm6, mm5 ; mm6 *= kernel 0 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = r0 p0..p3
+ psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+ movq mm6, [rbx + 16] ; kernel 1 taps
+ movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
+ punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
+ pmullw mm6, mm4 ; mm6 *= kernel 1 modifiers.
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = r0 p0..p3
+ psubusw mm6, mm4 ; mm6 = p0..p3 - r-1 p0..p3
+ psubusw mm4, mm1 ; mm4 = r-1 p0..p3 - p0..p3
+ paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+
+ paddusw mm3, RD ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ pand mm1, mm7 ; mm1 select vals > thresh from source
+ pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
+ paddusw mm1, mm7 ; combination
+
+ packuswb mm1, mm0 ; pack to bytes
+
+ movd [rdi], mm1 ;
+ neg rax ; pitch is positive
+
+
+ add rsi, 4
+ add rdi, 4
+ add rdx, 4
+
+ cmp edx, dword ptr arg(5) ;cols
+ jl nextcol
+ ; done with the all cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+
+ push rax
+ xor rdx, rdx
+ mov rax, [rdi-4];
+
+acrossnextcol:
+ ; horizontal pass: same 5-tap blur along the row, in place in dst
+ pxor mm7, mm7 ; mm7 = 00000000
+ movq mm6, [rbx + 32 ] ;
+ movq mm4, [rdi+rdx] ; mm4 = p0..p7
+ movq mm3, mm4 ; mm3 = p0..p7
+ punpcklbw mm3, mm0 ; mm3 = p0..p3
+ movq mm1, mm3 ; mm1 = p0..p3
+ pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
+
+ movq mm6, [rbx + 48]
+ psrlq mm4, 8 ; mm4 = p1..p7
+ movq mm5, mm4 ; mm5 = p1..p7
+ punpcklbw mm5, mm0 ; mm5 = p1..p4
+ pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm7, mm1 ; mm7 = p0..p3
+ psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
+ paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm7, mm2
+
+ movq mm6, [rbx + 64 ]
+ psrlq mm4, 8 ; mm4 = p2..p7
+ movq mm5, mm4 ; mm5 = p2..p7
+ punpcklbw mm5, mm0 ; mm5 = p2..p5
+ pmullw mm6, mm5 ; mm6 *= kernel 4 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = p0..p3
+ psubusw mm6, mm5 ; mm6 = p0..p3 - p2..p5
+ psubusw mm5, mm1 ; mm5 = p2..p5 - p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p2..p5)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+
+ movq mm6, [rbx ]
+ movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm6, mm5 ; mm6 *= kernel 0 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = p0..p3
+ psubusw mm6, mm5 ; mm6 = p0..p3 - p-2..p1
+ psubusw mm5, mm1 ; mm5 = p-2..p1 - p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p-2..p1)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+ movq mm6, [rbx + 16]
+ psrlq mm4, 8 ; mm4 = p-1..p5
+ punpcklbw mm4, mm0 ; mm4 = p-1..p2
+ pmullw mm6, mm4 ; mm6 *= kernel 1 modifiers.
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = p0..p3
+ psubusw mm6, mm4 ; mm6 = p0..p3 - p-1..p2
+ psubusw mm4, mm1 ; mm4 = p-1..p2 - p0..p3
+ paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p-1..p2)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+ paddusw mm3, RD ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ pand mm1, mm7 ; mm1 select vals > thresh from source
+ pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
+ paddusw mm1, mm7 ; combination
+
+ packuswb mm1, mm0 ; pack to bytes
+ mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
+ movd eax, mm1
+
+ add rdx, 4
+ cmp edx, dword ptr arg(5) ;cols
+ jl acrossnextcol;
+
+ mov DWORD PTR [rdi+rdx-4], eax
+ pop rax
+
+ ; done with this row
+ add rsi,rax ; next line
+ movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch
+ add rdi,rax ; next destination
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch
+
+ dec rcx ; decrement count
+ jnz nextrow ; next row
+ pop rbx
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ add rsp, 8 ; release the stack slot reserved for the local copy of rd
+%endif
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef RD
+
+
+;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+; Vertical deringing pass, 4 columns at a time.  For each column group it
+; keeps a running sum (mm5) and sum of squares (mm6/mm7) over a sliding
+; 15-row window; where 15*sumsq - sum^2 < flimit2 the pixel is replaced by
+; the dithered window mean (using the vp8_rv noise table), otherwise kept.
+; Results are staged through a 16-entry delay buffer d[] on the stack and
+; written back 8 rows behind the leading edge of the window.
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_mmx)
+sym(vp8_mbpost_proc_down_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 136
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [sym(vp8_rv) GLOBAL]
+%endif
+
+ ;rows +=8;
+ add dword ptr arg(2), 8
+
+ ;for(c=0; c<cols; c+=4)
+loop_col:
+ mov rsi, arg(0) ;s
+ pxor mm0, mm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+ neg rax ; rax = -pitch
+
+ lea rsi, [rsi + rax*8]; ; rsi = s[-pitch*8]
+ neg rax
+
+
+ pxor mm5, mm5
+ pxor mm6, mm6 ;
+
+ pxor mm7, mm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ; prime sum/sumsq over the first 15 rows of the window
+
+loop_initvar:
+ movd mm1, DWORD PTR [rdi];
+ punpcklbw mm1, mm0 ;
+
+ paddw mm5, mm1 ; mm5 += pixels (running sum)
+ pmullw mm1, mm1 ;
+
+ movq mm2, mm1 ;
+ punpcklwd mm1, mm0 ;
+
+ punpckhwd mm2, mm0 ;
+ paddd mm6, mm1 ; mm6/mm7 += squared pixels (running sum of squares)
+
+ paddd mm7, mm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+loop_row: ; rdx counts rows; rsi trails rdi by 15 rows
+ movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
+ movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+
+ paddw mm5, mm2 ; slide the window: add entering row,
+ psubw mm5, mm1 ; drop leaving row
+
+ pmullw mm2, mm2
+ movq mm4, mm2
+
+ punpcklwd mm2, mm0
+ punpckhwd mm4, mm0
+
+ paddd mm6, mm2
+ paddd mm7, mm4
+
+ pmullw mm1, mm1
+ movq mm2, mm1
+
+ punpcklwd mm1, mm0
+ psubd mm6, mm1
+
+ punpckhwd mm2, mm0
+ psubd mm7, mm2
+
+
+ movq mm3, mm6 ; compute 15*sumsq = sumsq*16 - sumsq
+ pslld mm3, 4
+
+ psubd mm3, mm6
+ movq mm1, mm5
+
+ movq mm4, mm5 ; sum^2 via low/high word products
+ pmullw mm1, mm1
+
+ pmulhw mm4, mm4
+ movq mm2, mm1
+
+ punpcklwd mm1, mm4
+ punpckhwd mm2, mm4
+
+ movq mm4, mm7
+ pslld mm4, 4
+
+ psubd mm4, mm7
+
+ psubd mm3, mm1
+ psubd mm4, mm2
+
+ psubd mm3, flimit2 ; mask = (15*sumsq - sum^2 - flimit2) < 0
+ psubd mm4, flimit2
+
+ psrad mm3, 31
+ psrad mm4, 31
+
+ packssdw mm3, mm4
+ packsswb mm3, mm0
+
+ movd mm1, DWORD PTR [rsi+rax*8] ; current row s[0] (rsi = s - pitch*8)
+
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+
+ paddw mm1, mm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [sym(vp8_rv) GLOBAL]
+ movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movq mm4, [sym(vp8_rv) + rcx*2]
+%endif
+ paddw mm1, mm4 ; dither the window mean with vp8_rv noise
+ ;paddw xmm1, eight8s
+ psraw mm1, 4 ; (sum + current + noise) >> 4
+
+ packuswb mm1, mm0
+ pand mm1, mm3 ; select mean where variance below threshold,
+
+ pandn mm3, mm2 ; original pixel otherwise
+ por mm1, mm3
+
+ and rcx, 15
+ movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+ movd [rsi], mm1 ; write back, 8 rows behind the leading edge
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl loop_row
+
+
+ add dword arg(0), 4 ; s += 4
+ sub dword arg(3), 4 ; cols -= 4
+ cmp dword arg(3), 0
+ jg loop_col
+
+ add rsp, 136
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit2
+
+
+;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+; Adds film-grain noise to a plane one row at a time: each row is first
+; clamped away from both extremes (so adding signed noise cannot wrap),
+; then 8 bytes of noise from a random offset are added per iteration.
+; NOTE(review): the inner loop advances 8 bytes at a time and so rounds
+; Width up to a multiple of 8 -- presumably the frame border padding
+; absorbs the overrun; confirm against the buffer allocation.
+extern sym(rand)
+global sym(vp8_plane_add_noise_mmx)
+sym(vp8_plane_add_noise_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+addnoise_loop:
+ call sym(rand) WRT_PLT ; random start offset into the noise buffer
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+addnoise_nextset:
+ movq mm1,[rsi+rax] ; get the source
+
+ psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb mm1, [rdx+32] ;bothclamp
+ psubusb mm1, [rdx+16] ;whiteclamp
+
+ movq mm2,[rdi+rax] ; get the noise for this line
+ paddb mm1,mm2 ; add it in
+ movq [rsi+rax],mm1 ; store the result
+
+ add rax,8 ; move to the next line
+
+ cmp rax, rcx
+ jl addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+; 5-tap blur kernel (16,16,64,16,16)/128, one 8-word row per tap position:
+; the code above loads taps at [Blur+0/+16/+32/+48/+64], giving 16,16,64,16,16
+; (taps sum to 128 = VP8_FILTER_WEIGHT); the trailing zero row is padding.
+Blur:
+ times 16 dw 16
+ times 8 dw 64
+ times 16 dw 16
+ times 8 dw 0
+
+rd: ; rounding constant added before the >> VP8_FILTER_SHIFT (0x40 = 128/2)
+ times 4 dw 0x40
diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c
new file mode 100644
index 000000000..095797b1e
--- /dev/null
+++ b/vp8/common/x86/postproc_mmx.c
@@ -0,0 +1,1507 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdlib.h>
+#include "vpx_scale/yv12config.h"
+#include "pragmas.h"
+
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+
+
+/* static constants */
+__declspec(align(16))
+const static short Blur[48] =
+{
+
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+};
+#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
+#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
+
+#ifndef RELOCATEABLE
+const static RD;
+const static R4D2;
+#endif
+
+
+/* external references */
+extern double vp8_gaussian(double sigma, double mu, double x);
+extern short vp8_rv[];
+extern int vp8_q2mbl(int x) ;
+
+
+
+void vp8_post_proc_down_and_across_mmx
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit
+)
+{
+#ifdef RELOCATEABLE
+ RD
+ R4D2
+#endif
+
+ __asm
+ {
+ push ebx
+ lea ebx, Blur
+ movd mm2, flimit
+ punpcklwd mm2, mm2
+ punpckldq mm2, mm2
+
+ mov esi, src_ptr
+ mov edi, dst_ptr
+
+ mov ecx, DWORD PTR rows
+ mov eax, src_pixels_per_line ;
+ destination pitch?
+ pxor mm0, mm0 ;
+ mm0 = 00000000
+
+ nextrow:
+
+ xor edx, edx ;
+
+ clear out edx for use as loop counter
+ nextcol:
+
+ pxor mm7, mm7 ;
+
+ mm7 = 00000000
+ movq mm6, [ebx + 32 ] ;
+ mm6 = kernel 2 taps
+ movq mm3, [esi] ;
+ mm4 = r0 p0..p7
+ punpcklbw mm3, mm0 ;
+ mm3 = p0..p3
+ movq mm1, mm3 ;
+ mm1 = p0..p3
+ pmullw mm3, mm6 ;
+ mm3 *= kernel 2 modifiers
+
+ movq mm6, [ebx + 48] ;
+ mm6 = kernel 3 taps
+ movq mm5, [esi + eax] ;
+ mm4 = r1 p0..p7
+ punpcklbw mm5, mm0 ;
+ mm5 = r1 p0..p3
+ pmullw mm6, mm5 ;
+ mm6 *= p0..p3 * kernel 3 modifiers
+ paddusw mm3, mm6 ;
+ mm3 += mm6
+
+ ;
+ thresholding
+ movq mm7, mm1 ;
+ mm7 = r0 p0..p3
+ psubusw mm7, mm5 ;
+ mm7 = r0 p0..p3 - r1 p0..p3
+ psubusw mm5, mm1 ;
+ mm5 = r1 p0..p3 - r0 p0..p3
+ paddusw mm7, mm5 ;
+ mm7 = abs(r0 p0..p3 - r1 p0..p3)
+ pcmpgtw mm7, mm2
+
+ movq mm6, [ebx + 64 ] ;
+ mm6 = kernel 4 modifiers
+ movq mm5, [esi + 2*eax] ;
+ mm4 = r2 p0..p7
+ punpcklbw mm5, mm0 ;
+ mm5 = r2 p0..p3
+ pmullw mm6, mm5 ;
+ mm5 *= kernel 4 modifiers
+ paddusw mm3, mm6 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movq mm6, mm1 ;
+ mm6 = r0 p0..p3
+ psubusw mm6, mm5 ;
+ mm6 = r0 p0..p3 - r2 p0..p3
+ psubusw mm5, mm1 ;
+ mm5 = r2 p0..p3 - r2 p0..p3
+ paddusw mm6, mm5 ;
+ mm6 = abs(r0 p0..p3 - r2 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ;
+ accumulate thresholds
+
+
+ neg eax
+ movq mm6, [ebx ] ;
+ kernel 0 taps
+ movq mm5, [esi+2*eax] ;
+ mm4 = r-2 p0..p7
+ punpcklbw mm5, mm0 ;
+ mm5 = r-2 p0..p3
+ pmullw mm6, mm5 ;
+ mm5 *= kernel 0 modifiers
+ paddusw mm3, mm6 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movq mm6, mm1 ;
+ mm6 = r0 p0..p3
+ psubusw mm6, mm5 ;
+ mm6 = p0..p3 - r-2 p0..p3
+ psubusw mm5, mm1 ;
+ mm5 = r-2 p0..p3 - p0..p3
+ paddusw mm6, mm5 ;
+ mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ;
+ accumulate thresholds
+
+ movq mm6, [ebx + 16] ;
+ kernel 1 taps
+ movq mm4, [esi+eax] ;
+ mm4 = r-1 p0..p7
+ punpcklbw mm4, mm0 ;
+ mm4 = r-1 p0..p3
+ pmullw mm6, mm4 ;
+ mm4 *= kernel 1 modifiers.
+ paddusw mm3, mm6 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movq mm6, mm1 ;
+ mm6 = r0 p0..p3
+ psubusw mm6, mm4 ;
+ mm6 = p0..p3 - r-2 p0..p3
+ psubusw mm4, mm1 ;
+ mm5 = r-1 p0..p3 - p0..p3
+ paddusw mm6, mm4 ;
+ mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ;
+ accumulate thresholds
+
+
+ paddusw mm3, rd ;
+ mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ;
+ mm3 /= 128
+
+ pand mm1, mm7 ;
+ mm1 select vals > thresh from source
+ pandn mm7, mm3 ;
+ mm7 select vals < thresh from blurred result
+ paddusw mm1, mm7 ;
+ combination
+
+ packuswb mm1, mm0 ;
+ pack to bytes
+
+ movd [edi], mm1 ;
+ neg eax ;
+ pitch is positive
+
+
+ add esi, 4
+ add edi, 4
+ add edx, 4
+
+ cmp edx, cols
+ jl nextcol
+ // done with the all cols, start the across filtering in place
+ sub esi, edx
+ sub edi, edx
+
+
+ push eax
+ xor edx, edx
+ mov eax, [edi-4];
+
+ acrossnextcol:
+ pxor mm7, mm7 ;
+ mm7 = 00000000
+ movq mm6, [ebx + 32 ] ;
+ movq mm4, [edi+edx] ;
+ mm4 = p0..p7
+ movq mm3, mm4 ;
+ mm3 = p0..p7
+ punpcklbw mm3, mm0 ;
+ mm3 = p0..p3
+ movq mm1, mm3 ;
+ mm1 = p0..p3
+ pmullw mm3, mm6 ;
+ mm3 *= kernel 2 modifiers
+
+ movq mm6, [ebx + 48]
+ psrlq mm4, 8 ;
+ mm4 = p1..p7
+ movq mm5, mm4 ;
+ mm5 = p1..p7
+ punpcklbw mm5, mm0 ;
+ mm5 = p1..p4
+ pmullw mm6, mm5 ;
+ mm6 *= p1..p4 * kernel 3 modifiers
+ paddusw mm3, mm6 ;
+ mm3 += mm6
+
+ ;
+ thresholding
+ movq mm7, mm1 ;
+ mm7 = p0..p3
+ psubusw mm7, mm5 ;
+ mm7 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ;
+ mm5 = p1..p4 - p0..p3
+ paddusw mm7, mm5 ;
+ mm7 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm7, mm2
+
+ movq mm6, [ebx + 64 ]
+ psrlq mm4, 8 ;
+ mm4 = p2..p7
+ movq mm5, mm4 ;
+ mm5 = p2..p7
+ punpcklbw mm5, mm0 ;
+ mm5 = p2..p5
+ pmullw mm6, mm5 ;
+ mm5 *= kernel 4 modifiers
+ paddusw mm3, mm6 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movq mm6, mm1 ;
+ mm6 = p0..p3
+ psubusw mm6, mm5 ;
+ mm6 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ;
+ mm5 = p1..p4 - p0..p3
+ paddusw mm6, mm5 ;
+ mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ;
+ accumulate thresholds
+
+
+ movq mm6, [ebx ]
+ movq mm4, [edi+edx-2] ;
+ mm4 = p-2..p5
+ movq mm5, mm4 ;
+ mm5 = p-2..p5
+ punpcklbw mm5, mm0 ;
+ mm5 = p-2..p1
+ pmullw mm6, mm5 ;
+ mm5 *= kernel 0 modifiers
+ paddusw mm3, mm6 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movq mm6, mm1 ;
+ mm6 = p0..p3
+ psubusw mm6, mm5 ;
+ mm6 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ;
+ mm5 = p1..p4 - p0..p3
+ paddusw mm6, mm5 ;
+ mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ;
+ accumulate thresholds
+
+ movq mm6, [ebx + 16]
+ psrlq mm4, 8 ;
+ mm4 = p-1..p5
+ punpcklbw mm4, mm0 ;
+ mm4 = p-1..p2
+ pmullw mm6, mm4 ;
+ mm4 *= kernel 1 modifiers.
+ paddusw mm3, mm6 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movq mm6, mm1 ;
+ mm6 = p0..p3
+ psubusw mm6, mm4 ;
+ mm6 = p0..p3 - p1..p4
+ psubusw mm4, mm1 ;
+ mm4 = p1..p4 - p0..p3
+ paddusw mm6, mm4 ;
+ mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ;
+ accumulate thresholds
+
+ paddusw mm3, rd ;
+ mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ;
+ mm3 /= 128
+
+ pand mm1, mm7 ;
+ mm1 select vals > thresh from source
+ pandn mm7, mm3 ;
+ mm7 select vals < thresh from blurred result
+ paddusw mm1, mm7 ;
+ combination
+
+ packuswb mm1, mm0 ;
+ pack to bytes
+ mov DWORD PTR [edi+edx-4], eax ;
+ store previous four bytes
+ movd eax, mm1
+
+ add edx, 4
+ cmp edx, cols
+ jl acrossnextcol;
+
+ mov DWORD PTR [edi+edx-4], eax
+ pop eax
+
+ // done with this row
+ add esi, eax ;
+ next line
+ mov eax, dst_pixels_per_line ;
+ destination pitch?
+ add edi, eax ;
+ next destination
+ mov eax, src_pixels_per_line ;
+ destination pitch?
+
+ dec ecx ;
+ decrement count
+ jnz nextrow ;
+ next row
+ pop ebx
+
+ }
+}
+
+
+
+void vp8_post_proc_down_and_across_xmm
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit
+)
+{
+#ifdef RELOCATEABLE
+ R4D2
+#endif
+
+ __asm
+ {
+ movd xmm2, flimit
+ punpcklwd xmm2, xmm2
+ punpckldq xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ mov esi, src_ptr
+ mov edi, dst_ptr
+
+ mov ecx, DWORD PTR rows
+ mov eax, src_pixels_per_line ;
+ destination pitch?
+ pxor xmm0, xmm0 ;
+ mm0 = 00000000
+
+ nextrow:
+
+ xor edx, edx ;
+
+ clear out edx for use as loop counter
+ nextcol:
+ movq xmm3, QWORD PTR [esi] ;
+
+ mm4 = r0 p0..p7
+ punpcklbw xmm3, xmm0 ;
+ mm3 = p0..p3
+ movdqa xmm1, xmm3 ;
+ mm1 = p0..p3
+ psllw xmm3, 2 ;
+
+ movq xmm5, QWORD PTR [esi + eax] ;
+ mm4 = r1 p0..p7
+ punpcklbw xmm5, xmm0 ;
+ mm5 = r1 p0..p3
+ paddusw xmm3, xmm5 ;
+ mm3 += mm6
+
+ ;
+ thresholding
+ movdqa xmm7, xmm1 ;
+ mm7 = r0 p0..p3
+ psubusw xmm7, xmm5 ;
+ mm7 = r0 p0..p3 - r1 p0..p3
+ psubusw xmm5, xmm1 ;
+ mm5 = r1 p0..p3 - r0 p0..p3
+ paddusw xmm7, xmm5 ;
+ mm7 = abs(r0 p0..p3 - r1 p0..p3)
+ pcmpgtw xmm7, xmm2
+
+ movq xmm5, QWORD PTR [esi + 2*eax] ;
+ mm4 = r2 p0..p7
+ punpcklbw xmm5, xmm0 ;
+ mm5 = r2 p0..p3
+ paddusw xmm3, xmm5 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movdqa xmm6, xmm1 ;
+ mm6 = r0 p0..p3
+ psubusw xmm6, xmm5 ;
+ mm6 = r0 p0..p3 - r2 p0..p3
+ psubusw xmm5, xmm1 ;
+ mm5 = r2 p0..p3 - r0 p0..p3
+ paddusw xmm6, xmm5 ;
+ mm6 = abs(r0 p0..p3 - r2 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ;
+ accumulate thresholds
+
+
+ neg eax
+ movq xmm5, QWORD PTR [esi+2*eax] ;
+ mm4 = r-2 p0..p7
+ punpcklbw xmm5, xmm0 ;
+ mm5 = r-2 p0..p3
+ paddusw xmm3, xmm5 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movdqa xmm6, xmm1 ;
+ mm6 = r0 p0..p3
+ psubusw xmm6, xmm5 ;
+ mm6 = p0..p3 - r-2 p0..p3
+ psubusw xmm5, xmm1 ;
+ mm5 = r-2 p0..p3 - p0..p3
+ paddusw xmm6, xmm5 ;
+ mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ;
+ accumulate thresholds
+
+ movq xmm4, QWORD PTR [esi+eax] ;
+ mm4 = r-1 p0..p7
+ punpcklbw xmm4, xmm0 ;
+ mm4 = r-1 p0..p3
+ paddusw xmm3, xmm4 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movdqa xmm6, xmm1 ;
+ mm6 = r0 p0..p3
+ psubusw xmm6, xmm4 ;
+ mm6 = p0..p3 - r-2 p0..p3
+ psubusw xmm4, xmm1 ;
+ mm4 = r-1 p0..p3 - p0..p3
+ paddusw xmm6, xmm4 ;
+ mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ;
+ accumulate thresholds
+
+
+ paddusw xmm3, rd42 ;
+ mm3 += round value
+ psraw xmm3, 3 ;
+ mm3 /= 8
+
+ pand xmm1, xmm7 ;
+ mm1 select vals > thresh from source
+ pandn xmm7, xmm3 ;
+ mm7 select vals < thresh from blurred result
+ paddusw xmm1, xmm7 ;
+ combination
+
+ packuswb xmm1, xmm0 ;
+ pack to bytes
+ movq QWORD PTR [edi], xmm1 ;
+
+ neg eax ;
+ pitch is positive
+ add esi, 8
+ add edi, 8
+
+ add edx, 8
+ cmp edx, cols
+
+ jl nextcol
+
+ // done with all the cols, start the across filtering in place
+ sub esi, edx
+ sub edi, edx
+
+ xor edx, edx
+ movq mm0, QWORD PTR [edi-8];
+
+ acrossnextcol:
+ movq xmm7, QWORD PTR [edi +edx -2]
+ movd xmm4, DWORD PTR [edi +edx +6]
+
+ pslldq xmm4, 8
+ por xmm4, xmm7
+
+ movdqa xmm3, xmm4
+ psrldq xmm3, 2
+ punpcklbw xmm3, xmm0 ;
+ mm3 = p0..p3
+ movdqa xmm1, xmm3 ;
+ mm1 = p0..p3
+ psllw xmm3, 2
+
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 3
+ punpcklbw xmm5, xmm0 ;
+ mm5 = p1..p4
+ paddusw xmm3, xmm5 ;
+ mm3 += mm6
+
+ ;
+ thresholding
+ movdqa xmm7, xmm1 ;
+ mm7 = p0..p3
+ psubusw xmm7, xmm5 ;
+ mm7 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ;
+ mm5 = p1..p4 - p0..p3
+ paddusw xmm7, xmm5 ;
+ mm7 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm7, xmm2
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ punpcklbw xmm5, xmm0 ;
+ mm5 = p2..p5
+ paddusw xmm3, xmm5 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movdqa xmm6, xmm1 ;
+ mm6 = p0..p3
+ psubusw xmm6, xmm5 ;
+ mm6 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ;
+ mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm5 ;
+ mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ;
+ accumulate thresholds
+
+
+ movdqa xmm5, xmm4 ;
+ mm5 = p-2..p5
+ punpcklbw xmm5, xmm0 ;
+ mm5 = p-2..p1
+ paddusw xmm3, xmm5 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movdqa xmm6, xmm1 ;
+ mm6 = p0..p3
+ psubusw xmm6, xmm5 ;
+ mm6 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ;
+ mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm5 ;
+ mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ;
+ accumulate thresholds
+
+ psrldq xmm4, 1 ;
+ mm4 = p-1..p5
+ punpcklbw xmm4, xmm0 ;
+ mm4 = p-1..p2
+ paddusw xmm3, xmm4 ;
+ mm3 += mm5
+
+ ;
+ thresholding
+ movdqa xmm6, xmm1 ;
+ mm6 = p0..p3
+ psubusw xmm6, xmm4 ;
+ mm6 = p0..p3 - p1..p4
+ psubusw xmm4, xmm1 ;
+ mm4 = p1..p4 - p0..p3
+ paddusw xmm6, xmm4 ;
+ mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ;
+ accumulate thresholds
+
+ paddusw xmm3, rd42 ;
+ mm3 += round value
+ psraw xmm3, 3 ;
+ mm3 /= 8
+
+ pand xmm1, xmm7 ;
+ mm1 select vals > thresh from source
+ pandn xmm7, xmm3 ;
+ mm7 select vals < thresh from blurred result
+ paddusw xmm1, xmm7 ;
+ combination
+
+ packuswb xmm1, xmm0 ;
+ pack to bytes
+ movq QWORD PTR [edi+edx-8], mm0 ;
+ store previous eight bytes
+ movdq2q mm0, xmm1
+
+ add edx, 8
+ cmp edx, cols
+ jl acrossnextcol;
+
+ // last 8 pixels
+ movq QWORD PTR [edi+edx-8], mm0
+
+ // done with this row
+ add esi, eax ;
+ next line
+ mov eax, dst_pixels_per_line ;
+ destination pitch?
+ add edi, eax ;
+ next destination
+ mov eax, src_pixels_per_line ;
+ destination pitch?
+
+ dec ecx ;
+ decrement count
+ jnz nextrow ;
+ next row
+ }
+}
+
+
+void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+{
+ int c, i;
+ __declspec(align(16))
+ int flimit2[2];
+ __declspec(align(16))
+ unsigned char d[16][8];
+
+ flimit = vp8_q2mbl(flimit);
+
+ for (i = 0; i < 2; i++)
+ flimit2[i] = flimit;
+
+ rows += 8;
+
+ for (c = 0; c < cols; c += 4)
+ {
+ unsigned char *s = &dst[c];
+
+ __asm
+ {
+ mov esi, s ;
+ pxor mm0, mm0 ;
+
+ mov eax, pitch ;
+ neg eax // eax = -pitch
+
+ lea esi, [esi + eax*8]; // esi = s[-pitch*8]
+ neg eax
+
+
+ pxor mm5, mm5
+ pxor mm6, mm6 ;
+
+ pxor mm7, mm7 ;
+ mov edi, esi
+
+ mov ecx, 15 ;
+
+ loop_initvar:
+ movd mm1, DWORD PTR [edi];
+ punpcklbw mm1, mm0 ;
+
+ paddw mm5, mm1 ;
+ pmullw mm1, mm1 ;
+
+ movq mm2, mm1 ;
+ punpcklwd mm1, mm0 ;
+
+ punpckhwd mm2, mm0 ;
+ paddd mm6, mm1 ;
+
+ paddd mm7, mm2 ;
+ lea edi, [edi+eax] ;
+
+ dec ecx
+ jne loop_initvar
+ //save the var and sum
+ xor edx, edx
+ loop_row:
+ movd mm1, DWORD PTR [esi] // [s-pitch*8]
+ movd mm2, DWORD PTR [edi] // [s+pitch*7]
+
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+
+ paddw mm5, mm2
+ psubw mm5, mm1
+
+ pmullw mm2, mm2
+ movq mm4, mm2
+
+ punpcklwd mm2, mm0
+ punpckhwd mm4, mm0
+
+ paddd mm6, mm2
+ paddd mm7, mm4
+
+ pmullw mm1, mm1
+ movq mm2, mm1
+
+ punpcklwd mm1, mm0
+ psubd mm6, mm1
+
+ punpckhwd mm2, mm0
+ psubd mm7, mm2
+
+
+ movq mm3, mm6
+ pslld mm3, 4
+
+ psubd mm3, mm6
+ movq mm1, mm5
+
+ movq mm4, mm5
+ pmullw mm1, mm1
+
+ pmulhw mm4, mm4
+ movq mm2, mm1
+
+ punpcklwd mm1, mm4
+ punpckhwd mm2, mm4
+
+ movq mm4, mm7
+ pslld mm4, 4
+
+ psubd mm4, mm7
+
+ psubd mm3, mm1
+ psubd mm4, mm2
+
+ psubd mm3, flimit2
+ psubd mm4, flimit2
+
+ psrad mm3, 31
+ psrad mm4, 31
+
+ packssdw mm3, mm4
+ packsswb mm3, mm0
+
+ movd mm1, DWORD PTR [esi+eax*8]
+
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+
+ paddw mm1, mm5
+ mov ecx, edx
+
+ and ecx, 127
+ movq mm4, vp8_rv[ecx*2]
+
+ paddw mm1, mm4
+ //paddw xmm1, eight8s
+ psraw mm1, 4
+
+ packuswb mm1, mm0
+ pand mm1, mm3
+
+ pandn mm3, mm2
+ por mm1, mm3
+
+ and ecx, 15
+ movd DWORD PTR d[ecx*4], mm1
+
+ mov ecx, edx
+ sub ecx, 8
+
+ and ecx, 15
+ movd mm1, DWORD PTR d[ecx*4]
+
+ movd [esi], mm1
+ lea esi, [esi+eax]
+
+ lea edi, [edi+eax]
+ add edx, 1
+
+ cmp edx, rows
+ jl loop_row
+
+ }
+
+ }
+}
+
+void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+{
+ int c, i;
+ __declspec(align(16))
+ int flimit4[4];
+ __declspec(align(16))
+ unsigned char d[16][8];
+
+ flimit = vp8_q2mbl(flimit);
+
+ for (i = 0; i < 4; i++)
+ flimit4[i] = flimit;
+
+ rows += 8;
+
+ for (c = 0; c < cols; c += 8)
+ {
+ unsigned char *s = &dst[c];
+
+ __asm
+ {
+ mov esi, s ;
+ pxor xmm0, xmm0 ;
+
+ mov eax, pitch ;
+ neg eax // eax = -pitch
+
+ lea esi, [esi + eax*8]; // esi = s[-pitch*8]
+ neg eax
+
+
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6 ;
+
+ pxor xmm7, xmm7 ;
+ mov edi, esi
+
+ mov ecx, 15 ;
+
+ loop_initvar:
+ movq xmm1, QWORD PTR [edi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea edi, [edi+eax] ;
+
+ dec ecx
+ jne loop_initvar
+ //save the var and sum
+ xor edx, edx
+ loop_row:
+ movq xmm1, QWORD PTR [esi] // [s-pitch*8]
+ movq xmm2, QWORD PTR [edi] // [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2
+ psubw xmm5, xmm1
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1
+
+ pmulhw xmm4, xmm4
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4
+ punpckhwd xmm2, xmm4
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0
+
+ movq xmm1, QWORD PTR [esi+eax*8]
+
+ movq xmm2, xmm1
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov ecx, edx
+
+ and ecx, 127
+ movdqu xmm4, vp8_rv[ecx*2]
+
+ paddw xmm1, xmm4
+ //paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3
+
+ pandn xmm3, xmm2
+ por xmm1, xmm3
+
+ and ecx, 15
+ movq QWORD PTR d[ecx*8], xmm1
+
+ mov ecx, edx
+ sub ecx, 8
+
+ and ecx, 15
+ movq mm0, d[ecx*8]
+
+ movq [esi], mm0
+ lea esi, [esi+eax]
+
+ lea edi, [edi+eax]
+ add edx, 1
+
+ cmp edx, rows
+ jl loop_row
+
+ }
+
+ }
+}
+#if 0
+/****************************************************************************
+ *
+ * ROUTINE : plane_add_noise_wmt
+ *
+ * INPUTS : unsigned char *Start starting address of buffer to add gaussian
+ * noise to
+ * unsigned int Width width of plane
+ * unsigned int Height height of plane
+ * int Pitch distance between subsequent lines of frame
+ * int q quantizer used to determine amount of noise
+ * to add
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void.
+ *
+ * FUNCTION : adds gaussian noise to a plane of pixels
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
+{
+ unsigned int i;
+
+ __declspec(align(16)) unsigned char blackclamp[16];
+ __declspec(align(16)) unsigned char whiteclamp[16];
+ __declspec(align(16)) unsigned char bothclamp[16];
+ char char_dist[300];
+ char Rand[2048];
+ double sigma;
+// return;
+ __asm emms
+ sigma = a + .5 + .6 * (63 - q) / 63.0;
+
+ // set up a lookup table of 256 entries that matches
+ // a gaussian distribution with sigma determined by q.
+ //
+ {
+ double i;
+ int next, j;
+
+ next = 0;
+
+ for (i = -32; i < 32; i++)
+ {
+ double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
+ int a = (int)(g + .5);
+
+ if (a)
+ {
+ for (j = 0; j < a; j++)
+ {
+ char_dist[next+j] = (char) i;
+ }
+
+ next = next + j;
+ }
+
+ }
+
+ for (next = next; next < 256; next++)
+ char_dist[next] = 0;
+
+ }
+
+ for (i = 0; i < 2048; i++)
+ {
+ Rand[i] = char_dist[rand() & 0xff];
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ blackclamp[i] = -char_dist[0];
+ whiteclamp[i] = -char_dist[0];
+ bothclamp[i] = -2 * char_dist[0];
+ }
+
+ for (i = 0; i < Height; i++)
+ {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = Rand + (rand() & 0xff);
+
+ __asm
+ {
+ mov ecx, [Width]
+ mov esi, Pos
+ mov edi, Ref
+ xor eax, eax
+
+ nextset:
+ movdqu xmm1, [esi+eax] // get the source
+
+ psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
+ paddusb xmm1, bothclamp
+ psubusb xmm1, whiteclamp
+
+ movdqu xmm2, [edi+eax] // get the noise for this line
+ paddb xmm1, xmm2 // add it in
+ movdqu [esi+eax], xmm1 // store the result
+
+ add eax, 16 // move to the next line
+
+ cmp eax, ecx
+ jl nextset
+
+
+ }
+
+ }
+}
+#endif
+__declspec(align(16))
+static const int four8s[4] = { 8, 8, 8, 8};
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
+{
+ int r, i;
+ __declspec(align(16))
+ int flimit4[4];
+ unsigned char *s = src;
+ int sumsq;
+ int sum;
+
+
+ flimit = vp8_q2mbl(flimit);
+ flimit4[0] =
+ flimit4[1] =
+ flimit4[2] =
+ flimit4[3] = flimit;
+
+ for (r = 0; r < rows; r++)
+ {
+
+
+ sumsq = 0;
+ sum = 0;
+
+ for (i = -8; i <= 6; i++)
+ {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ }
+
+ __asm
+ {
+ mov eax, sumsq
+ movd xmm7, eax
+
+ mov eax, sum
+ movd xmm6, eax
+
+ mov esi, s
+ xor ecx, ecx
+
+ mov edx, cols
+ add edx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+ nextcol4:
+
+ movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5
+ movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 // expanding
+ punpcklbw xmm2, xmm0 // expanding
+
+ punpcklwd xmm1, xmm0 // expanding to dwords
+ punpcklwd xmm2, xmm0 // expanding to dwords
+
+ psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 // duplicate the last ones
+ pshufd xmm7, xmm7, 0 // duplicate the last ones
+
+ psrldq xmm1, 4 // 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 // 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [esi+ecx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, four8s
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [esi+ecx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add ecx, 4
+
+ cmp ecx, edx
+ jl nextcol4
+
+ }
+ s += pitch;
+ }
+}
+
+#if 0
+
+/****************************************************************************
+ *
+ * ROUTINE : plane_add_noise_mmx
+ *
+ * INPUTS : unsigned char *Start starting address of buffer to add gaussian
+ * noise to
+ * unsigned int Width width of plane
+ * unsigned int Height height of plane
+ * int Pitch distance between subsequent lines of frame
+ * int q quantizer used to determine amount of noise
+ * to add
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void.
+ *
+ * FUNCTION : adds gaussian noise to a plane of pixels
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
+{
+ unsigned int i;
+ int Pitch4 = Pitch * 4;
+ const int noise_amount = 2;
+ const int noise_adder = 2 * noise_amount + 1;
+
+ __declspec(align(16)) unsigned char blackclamp[16];
+ __declspec(align(16)) unsigned char whiteclamp[16];
+ __declspec(align(16)) unsigned char bothclamp[16];
+
+ char char_dist[300];
+ char Rand[2048];
+
+ double sigma;
+ __asm emms
+ sigma = a + .5 + .6 * (63 - q) / 63.0;
+
+ // set up a lookup table of 256 entries that matches
+ // a gaussian distribution with sigma determined by q.
+ //
+ {
+ double i, sum = 0;
+ int next, j;
+
+ next = 0;
+
+ for (i = -32; i < 32; i++)
+ {
+ int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
+
+ if (a)
+ {
+ for (j = 0; j < a; j++)
+ {
+ char_dist[next+j] = (char) i;
+ }
+
+ next = next + j;
+ }
+
+ }
+
+ for (next = next; next < 256; next++)
+ char_dist[next] = 0;
+
+ }
+
+ for (i = 0; i < 2048; i++)
+ {
+ Rand[i] = char_dist[rand() & 0xff];
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ blackclamp[i] = -char_dist[0];
+ whiteclamp[i] = -char_dist[0];
+ bothclamp[i] = -2 * char_dist[0];
+ }
+
+ for (i = 0; i < Height; i++)
+ {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = Rand + (rand() & 0xff);
+
+ __asm
+ {
+ mov ecx, [Width]
+ mov esi, Pos
+ mov edi, Ref
+ xor eax, eax
+
+ nextset:
+ movq mm1, [esi+eax] // get the source
+
+ psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
+ paddusb mm1, bothclamp
+ psubusb mm1, whiteclamp
+
+ movq mm2, [edi+eax] // get the noise for this line
+ paddb mm1, mm2 // add it in
+ movq [esi+eax], mm1 // store the result
+
+ add eax, 8 // move to the next line
+
+ cmp eax, ecx
+ jl nextset
+
+
+ }
+
+ }
+}
+#else
+extern char an[8][64][3072];
+extern int cd[8][64];
+
+void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
+{
+ unsigned int i;
+ __declspec(align(16)) unsigned char blackclamp[16];
+ __declspec(align(16)) unsigned char whiteclamp[16];
+ __declspec(align(16)) unsigned char bothclamp[16];
+
+
+ __asm emms
+
+ for (i = 0; i < 16; i++)
+ {
+ blackclamp[i] = -cd[a][q];
+ whiteclamp[i] = -cd[a][q];
+ bothclamp[i] = -2 * cd[a][q];
+ }
+
+ for (i = 0; i < Height; i++)
+ {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = an[a][q] + (rand() & 0xff);
+
+ __asm
+ {
+ mov ecx, [Width]
+ mov esi, Pos
+ mov edi, Ref
+ xor eax, eax
+
+ nextset:
+ movq mm1, [esi+eax] // get the source
+
+ psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
+ paddusb mm1, bothclamp
+ psubusb mm1, whiteclamp
+
+ movq mm2, [edi+eax] // get the noise for this line
+ paddb mm1, mm2 // add it in
+ movq [esi+eax], mm1 // store the result
+
+ add eax, 8 // move to the next line
+
+ cmp eax, ecx
+ jl nextset
+ }
+ }
+}
+
+
+void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
+{
+ unsigned int i;
+
+ __declspec(align(16)) unsigned char blackclamp[16];
+ __declspec(align(16)) unsigned char whiteclamp[16];
+ __declspec(align(16)) unsigned char bothclamp[16];
+
+ __asm emms
+
+ for (i = 0; i < 16; i++)
+ {
+ blackclamp[i] = -cd[a][q];
+ whiteclamp[i] = -cd[a][q];
+ bothclamp[i] = -2 * cd[a][q];
+ }
+
+ for (i = 0; i < Height; i++)
+ {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = an[a][q] + (rand() & 0xff);
+
+ __asm
+ {
+ mov ecx, [Width]
+ mov esi, Pos
+ mov edi, Ref
+ xor eax, eax
+
+ nextset:
+ movdqu xmm1, [esi+eax] // get the source
+
+ psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
+ paddusb xmm1, bothclamp
+ psubusb xmm1, whiteclamp
+
+ movdqu xmm2, [edi+eax] // get the noise for this line
+ paddb xmm1, xmm2 // add it in
+ movdqu [esi+eax], xmm1 // store the result
+
+ add eax, 16 // move to the next line
+
+ cmp eax, ecx
+ jl nextset
+ }
+ }
+}
+
+#endif
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
new file mode 100644
index 000000000..bfa36fa70
--- /dev/null
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -0,0 +1,688 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_post_proc_down_and_across_xmm
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int rows,
+; int cols,
+; int flimit
+;)
+global sym(vp8_post_proc_down_and_across_xmm)
+sym(vp8_post_proc_down_and_across_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ ALIGN_STACK 16, rax
+ ; move the global rd onto the stack, since we don't have enough registers
+ ; to do PIC addressing
+ movdqa xmm0, [rd42 GLOBAL]
+ sub rsp, 16
+ movdqa [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [rd42 GLOBAL]
+%endif
+
+
+ movd xmm2, dword ptr arg(6) ;flimit
+ punpcklwd xmm2, xmm2
+ punpckldq xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rcx, DWORD PTR arg(4) ;rows
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; source pitch?
+ pxor xmm0, xmm0 ; mm0 = 00000000
+
+nextrow:
+
+ xor rdx, rdx ; clear out rdx for use as loop counter
+nextcol:
+ movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
+ punpcklbw xmm3, xmm0 ; mm3 = p0..p3
+ movdqa xmm1, xmm3 ; mm1 = p0..p3
+ psllw xmm3, 2 ;
+
+ movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
+ punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
+ paddusw xmm3, xmm5 ; mm3 += mm6
+
+ ; thresholding
+ movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
+ psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
+ psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
+ paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+ pcmpgtw xmm7, xmm2
+
+ movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
+ punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
+ psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
+ psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r0 p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+
+ neg rax
+ movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
+ punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
+ psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+ movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
+ punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
+ paddusw xmm3, xmm4 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
+ psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw xmm4, xmm1 ; mm4 = r-1 p0..p3 - p0..p3
+ paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+
+ paddusw xmm3, RD42 ; mm3 += round value
+ psraw xmm3, 3 ; mm3 /= 8
+
+ pand xmm1, xmm7 ; mm1 select vals > thresh from source
+ pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
+ paddusw xmm1, xmm7 ; combination
+
+ packuswb xmm1, xmm0 ; pack to bytes
+ movq QWORD PTR [rdi], xmm1 ;
+
+ neg rax ; pitch is positive
+ add rsi, 8
+ add rdi, 8
+
+ add rdx, 8
+ cmp edx, dword arg(5) ;cols
+
+ jl nextcol
+
+ ; done with all the cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-8];
+
+acrossnextcol:
+ movq xmm7, QWORD PTR [rdi +rdx -2]
+ movd xmm4, DWORD PTR [rdi +rdx +6]
+
+ pslldq xmm4, 8
+ por xmm4, xmm7
+
+ movdqa xmm3, xmm4
+ psrldq xmm3, 2
+ punpcklbw xmm3, xmm0 ; mm3 = p0..p3
+ movdqa xmm1, xmm3 ; mm1 = p0..p3
+ psllw xmm3, 2
+
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 3
+ punpcklbw xmm5, xmm0 ; mm5 = p1..p4
+ paddusw xmm3, xmm5 ; mm3 += mm6
+
+ ; thresholding
+ movdqa xmm7, xmm1 ; mm7 = p0..p3
+ psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm7, xmm2
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ punpcklbw xmm5, xmm0 ; mm5 = p2..p5
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = p0..p3
+ psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+
+ movdqa xmm5, xmm4 ; mm5 = p-2..p5
+ punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = p0..p3
+ psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+ psrldq xmm4, 1 ; mm4 = p-1..p5
+ punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
+ paddusw xmm3, xmm4 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = p0..p3
+ psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
+ psubusw xmm4, xmm1 ; mm4 = p1..p4 - p0..p3
+ paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+ paddusw xmm3, RD42 ; mm3 += round value
+ psraw xmm3, 3 ; mm3 /= 8
+
+ pand xmm1, xmm7 ; mm1 select vals > thresh from source
+ pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
+ paddusw xmm1, xmm7 ; combination
+
+ packuswb xmm1, xmm0 ; pack to bytes
+ movq QWORD PTR [rdi+rdx-8], mm0 ; store previous eight bytes
+ movdq2q mm0, xmm1
+
+ add rdx, 8
+ cmp edx, dword arg(5) ;cols
+ jl acrossnextcol;
+
+ ; last 8 pixels
+ movq QWORD PTR [rdi+rdx-8], mm0
+
+ ; done with this row
+ add rsi,rax ; next line
+ mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
+ add rdi,rax ; next destination
+ mov eax, dword arg(2) ;src_pixels_per_line ; source pitch?
+
+ dec rcx ; decrement count
+ jnz nextrow ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ add rsp,16
+ pop rsp
+%endif
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef RD42
+
+
+;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_xmm)
+sym(vp8_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [sym(vp8_rv) GLOBAL]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8
+
+ ;for(c=0; c<cols; c+=8)
+loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+ neg rax ; rax = -pitch
+
+ lea rsi, [rsi + rax*8]; ; rsi = s[-pitch*8]
+ neg rax
+
+
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6 ;
+
+ pxor xmm7, xmm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2
+ psubw xmm5, xmm1
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1
+
+ pmulhw xmm4, xmm4
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4
+ punpckhwd xmm2, xmm4
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0
+
+ movq xmm1, QWORD PTR [rsi+rax*8]
+
+ movq xmm2, xmm1
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [sym(vp8_rv) GLOBAL]
+ movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movdqu xmm4, [sym(vp8_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3
+
+ pandn xmm3, xmm2
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8]
+
+ movq [rsi], mm0
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl loop_row
+
+ add dword arg(0), 8 ; s += 8
+ sub dword arg(3), 8 ; cols -= 8
+ cmp dword arg(3), 0
+ jg loop_col
+
+ add rsp, 128+16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vp8_mbpost_proc_across_ip_xmm)
+sym(vp8_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+ mov rdi, -8
+ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [four8s GLOBAL]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_wmt)
+sym(vp8_plane_add_noise_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb xmm1, [rdx+32] ;bothclamp
+ psubusb xmm1, [rdx+16] ;whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next line
+
+ cmp rax, rcx
+ jl addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd42:
+ times 8 dw 0x04
+four8s:
+ times 4 dd 8
diff --git a/vp8/common/x86/postproc_x86.h b/vp8/common/x86/postproc_x86.h
new file mode 100644
index 000000000..49a190793
--- /dev/null
+++ b/vp8/common/x86/postproc_x86.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_X86_H
+#define POSTPROC_X86_H
+
+/* Note:
+ *
+ *  This platform is commonly built for runtime CPU detection. If you modify
+ *  any of the function mappings present in this file, be sure to also update
+ *  them in the function pointer initialization code
+ */
+
+/* Prototypes for the MMX assembly implementations of the postproc hooks. */
+#if HAVE_MMX
+extern prototype_postproc_inplace(vp8_mbpost_proc_down_mmx);
+extern prototype_postproc(vp8_post_proc_down_and_across_mmx);
+extern prototype_postproc_addnoise(vp8_plane_add_noise_mmx);
+
+/* With runtime CPU detection disabled, statically bind the RTCD names
+ * straight to the MMX versions.  (The SSE2 section below re-binds them
+ * when both are built, since it is included after this one.)
+ */
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_postproc_down
+#define vp8_postproc_down vp8_mbpost_proc_down_mmx
+
+#undef  vp8_postproc_downacross
+#define vp8_postproc_downacross vp8_post_proc_down_and_across_mmx
+
+#undef  vp8_postproc_addnoise
+#define vp8_postproc_addnoise vp8_plane_add_noise_mmx
+
+#endif
+#endif
+
+
+/* Prototypes for the SSE2 assembly implementations of the postproc hooks. */
+#if HAVE_SSE2
+extern prototype_postproc_inplace(vp8_mbpost_proc_down_xmm);
+extern prototype_postproc_inplace(vp8_mbpost_proc_across_ip_xmm);
+extern prototype_postproc(vp8_post_proc_down_and_across_xmm);
+extern prototype_postproc_addnoise(vp8_plane_add_noise_wmt);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_postproc_down
+#define vp8_postproc_down vp8_mbpost_proc_down_xmm
+
+#undef  vp8_postproc_across
+#define vp8_postproc_across vp8_mbpost_proc_across_ip_xmm
+
+#undef  vp8_postproc_downacross
+#define vp8_postproc_downacross vp8_post_proc_down_and_across_xmm
+
+#undef  vp8_postproc_addnoise
+#define vp8_postproc_addnoise vp8_plane_add_noise_wmt
+
+
+#endif
+#endif
+
+#endif
diff --git a/vp8/common/x86/recon_mmx.asm b/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 000000000..ba60c5db7
--- /dev/null
+++ b/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,320 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
+;
+; Reconstructs one 4x4 block: adds the 16-bit residual 'q' to the predictor
+; bytes 's' with signed saturation, then clips to [0,255] via packuswb and
+; stores 4 bytes per row into 'd'.
+; NOTE(review): predictor rows are read at fixed 16-byte steps (s, s+16,
+; s+32, s+48) while destination rows advance by 'stride' - the predictor
+; buffer appears to use a 16-byte pitch; confirm against callers.
+global sym(vp8_recon_b_mmx)
+sym(vp8_recon_b_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;s
+        mov       rdi, arg(2) ;d
+        mov       rdx, arg(1) ;q
+        movsxd    rax, dword ptr arg(3) ;stride
+        pxor      mm0, mm0              ; zero, for byte<->word unpacking
+
+        ; row 0
+        movd      mm1, [rsi]
+        punpcklbw mm1, mm0
+        paddsw    mm1, [rdx]
+        packuswb  mm1, mm0              ; pack saturates the words to [0,255] bytes
+        movd      [rdi], mm1
+
+        ; row 1 (residual blocks are 16 shorts = 32 bytes apart)
+        movd      mm2, [rsi+16]
+        punpcklbw mm2, mm0
+        paddsw    mm2, [rdx+32]
+        packuswb  mm2, mm0
+        movd      [rdi+rax], mm2
+
+        ; row 2
+        movd      mm3, [rsi+32]
+        punpcklbw mm3, mm0
+        paddsw    mm3, [rdx+64]
+        packuswb  mm3, mm0
+        movd      [rdi+2*rax], mm3
+
+        ; row 3: bump rdi by one stride so [rdi+2*rax] addresses d+3*stride
+        add       rdi, rax
+        movd      mm4, [rsi+48]
+        punpcklbw mm4, mm0
+        paddsw    mm4, [rdx+96]
+        packuswb  mm4, mm0
+        movd      [rdi+2*rax], mm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void copy_mem8x8_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+; Copies an 8x8 block of bytes.  movq loads/stores have no alignment
+; requirement; loads and stores are interleaved three rows at a time to
+; hide latency.
+global sym(vp8_copy_mem8x8_mmx)
+sym(vp8_copy_mem8x8_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        ; rows 0-2: load, then advance rsi past them
+        mov       rsi, arg(0) ;src;
+        movq      mm0, [rsi]
+
+        movsxd    rax, dword ptr arg(1) ;src_stride;
+        mov       rdi, arg(2) ;dst;
+
+        movq      mm1, [rsi+rax]
+        movq      mm2, [rsi+rax*2]
+
+        movsxd    rcx, dword ptr arg(3) ;dst_stride
+        lea       rsi, [rsi+rax*2]
+
+        movq      [rdi], mm0
+        add       rsi, rax              ; rsi -> row 3
+
+        movq      [rdi+rcx], mm1
+        movq      [rdi+rcx*2], mm2
+
+
+        ; rows 3-5
+        lea       rdi, [rdi+rcx*2]
+        movq      mm3, [rsi]
+
+        add       rdi, rcx              ; rdi -> row 3
+        movq      mm4, [rsi+rax]
+
+        movq      mm5, [rsi+rax*2]
+        movq      [rdi], mm3
+
+        lea       rsi, [rsi+rax*2]
+        movq      [rdi+rcx], mm4
+
+        movq      [rdi+rcx*2], mm5
+        lea       rdi, [rdi+rcx*2]
+
+        ; rows 6-7 (rsi currently points at row 5, rdi at row 5)
+        movq      mm0, [rsi+rax]
+        movq      mm1, [rsi+rax*2]
+
+        movq      [rdi+rcx], mm0
+        movq      [rdi+rcx*2],mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void copy_mem8x4_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+; Copies an 8x4 block of bytes with unaligned movq loads/stores.
+global sym(vp8_copy_mem8x4_mmx)
+sym(vp8_copy_mem8x4_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        ; rows 0-2
+        mov       rsi, arg(0) ;src;
+        movq      mm0, [rsi]
+
+        movsxd    rax, dword ptr arg(1) ;src_stride;
+        mov       rdi, arg(2) ;dst;
+
+        movq      mm1, [rsi+rax]
+        movq      mm2, [rsi+rax*2]
+
+        movsxd    rcx, dword ptr arg(3) ;dst_stride
+        lea       rsi, [rsi+rax*2]        ; rsi -> row 2
+
+        movq      [rdi], mm0
+        movq      [rdi+rcx], mm1
+
+        movq      [rdi+rcx*2], mm2
+        lea       rdi, [rdi+rcx*2]        ; rdi -> row 2
+
+        ; row 3
+        movq      mm3, [rsi+rax]
+        movq      [rdi+rcx], mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void copy_mem16x16_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+; Copies a 16x16 block of bytes as two movq halves per row (offsets +0 and
+; +8).  Fully unrolled: five identical 3-row stanzas (rows 0-14) followed by
+; the final row 15.  movq has no alignment requirement.
+global sym(vp8_copy_mem16x16_mmx)
+sym(vp8_copy_mem16x16_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;src;
+        movsxd    rax, dword ptr arg(1) ;src_stride;
+
+        mov       rdi, arg(2) ;dst;
+        movsxd    rcx, dword ptr arg(3) ;dst_stride
+
+        ; rows 0-2
+        movq      mm0, [rsi]
+        movq      mm3, [rsi+8];
+
+        movq      mm1, [rsi+rax]
+        movq      mm4, [rsi+rax+8]
+
+        movq      mm2, [rsi+rax*2]
+        movq      mm5, [rsi+rax*2+8]
+
+        lea       rsi, [rsi+rax*2]
+        add       rsi, rax              ; rsi advances 3 rows
+
+        movq      [rdi], mm0
+        movq      [rdi+8], mm3
+
+        movq      [rdi+rcx], mm1
+        movq      [rdi+rcx+8], mm4
+
+        movq      [rdi+rcx*2], mm2
+        movq      [rdi+rcx*2+8], mm5
+
+        lea       rdi, [rdi+rcx*2]
+        add       rdi, rcx              ; rdi advances 3 rows
+
+        ; rows 3-5
+        movq      mm0, [rsi]
+        movq      mm3, [rsi+8];
+
+        movq      mm1, [rsi+rax]
+        movq      mm4, [rsi+rax+8]
+
+        movq      mm2, [rsi+rax*2]
+        movq      mm5, [rsi+rax*2+8]
+
+        lea       rsi, [rsi+rax*2]
+        add       rsi, rax
+
+        movq      [rdi], mm0
+        movq      [rdi+8], mm3
+
+        movq      [rdi+rcx], mm1
+        movq      [rdi+rcx+8], mm4
+
+        movq      [rdi+rcx*2], mm2
+        movq      [rdi+rcx*2+8], mm5
+
+        lea       rdi, [rdi+rcx*2]
+        add       rdi, rcx
+
+        ; rows 6-8
+        movq      mm0, [rsi]
+        movq      mm3, [rsi+8];
+
+        movq      mm1, [rsi+rax]
+        movq      mm4, [rsi+rax+8]
+
+        movq      mm2, [rsi+rax*2]
+        movq      mm5, [rsi+rax*2+8]
+
+        lea       rsi, [rsi+rax*2]
+        add       rsi, rax
+
+        movq      [rdi], mm0
+        movq      [rdi+8], mm3
+
+        movq      [rdi+rcx], mm1
+        movq      [rdi+rcx+8], mm4
+
+        movq      [rdi+rcx*2], mm2
+        movq      [rdi+rcx*2+8], mm5
+
+        lea       rdi, [rdi+rcx*2]
+        add       rdi, rcx
+
+        ; rows 9-11
+        movq      mm0, [rsi]
+        movq      mm3, [rsi+8];
+
+        movq      mm1, [rsi+rax]
+        movq      mm4, [rsi+rax+8]
+
+        movq      mm2, [rsi+rax*2]
+        movq      mm5, [rsi+rax*2+8]
+
+        lea       rsi, [rsi+rax*2]
+        add       rsi, rax
+
+        movq      [rdi], mm0
+        movq      [rdi+8], mm3
+
+        movq      [rdi+rcx], mm1
+        movq      [rdi+rcx+8], mm4
+
+        movq      [rdi+rcx*2], mm2
+        movq      [rdi+rcx*2+8], mm5
+
+        lea       rdi, [rdi+rcx*2]
+        add       rdi, rcx
+
+        ; rows 12-14
+        movq      mm0, [rsi]
+        movq      mm3, [rsi+8];
+
+        movq      mm1, [rsi+rax]
+        movq      mm4, [rsi+rax+8]
+
+        movq      mm2, [rsi+rax*2]
+        movq      mm5, [rsi+rax*2+8]
+
+        lea       rsi, [rsi+rax*2]
+        add       rsi, rax
+
+        movq      [rdi], mm0
+        movq      [rdi+8], mm3
+
+        movq      [rdi+rcx], mm1
+        movq      [rdi+rcx+8], mm4
+
+        movq      [rdi+rcx*2], mm2
+        movq      [rdi+rcx*2+8], mm5
+
+        lea       rdi, [rdi+rcx*2]
+        add       rdi, rcx
+
+        ; row 15
+        movq      mm0, [rsi]
+        movq      mm3, [rsi+8];
+
+        movq      [rdi], mm0
+        movq      [rdi+8], mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 000000000..f2685a76f
--- /dev/null
+++ b/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,228 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+;
+; Reconstructs an 8-pixel-wide, 4-row block: predictor bytes 's' (8 bytes
+; per row, consecutive) plus residual shorts 'q' (8 shorts = 16 bytes per
+; row), clipped to [0,255] by packuswb, stored 8 bytes per row into 'd'.
+global sym(vp8_recon2b_sse2)
+sym(vp8_recon2b_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;s
+        mov       rdi, arg(2) ;d
+        mov       rdx, arg(1) ;q
+        movsxd    rax, dword ptr arg(3) ;stride
+        pxor      xmm0, xmm0            ; zero, for byte->word unpacking
+
+        ; row 0
+        movq      xmm1, MMWORD PTR [rsi]
+        punpcklbw xmm1, xmm0
+        paddsw    xmm1, XMMWORD PTR [rdx]
+        packuswb  xmm1, xmm0            ; pack saturates to [0,255] bytes
+        movq      MMWORD PTR [rdi], xmm1
+
+
+        ; row 1
+        movq      xmm2, MMWORD PTR [rsi+8]
+        punpcklbw xmm2, xmm0
+        paddsw    xmm2, XMMWORD PTR [rdx+16]
+        packuswb  xmm2, xmm0
+        movq      MMWORD PTR [rdi+rax], xmm2
+
+
+        ; row 2
+        movq      xmm3, MMWORD PTR [rsi+16]
+        punpcklbw xmm3, xmm0
+        paddsw    xmm3, XMMWORD PTR [rdx+32]
+        packuswb  xmm3, xmm0
+        movq      MMWORD PTR [rdi+rax*2], xmm3
+
+        ; row 3: bump rdi by one stride so [rdi+rax*2] addresses d+3*stride
+        add       rdi, rax
+        movq      xmm4, MMWORD PTR [rsi+24]
+        punpcklbw xmm4, xmm0
+        paddsw    xmm4, XMMWORD PTR [rdx+48]
+        packuswb  xmm4, xmm0
+        movq      MMWORD PTR [rdi+rax*2], xmm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+;
+; Reconstructs a 16-pixel-wide, 4-row block: each row is predictor + residual
+; processed as low/high byte halves (16 shorts of residual = 32 bytes per
+; row), saturated to [0,255] by packuswb.
+; NOTE: movdqa loads/stores require 's', 'q', 'd' (and d+stride) to be
+; 16-byte aligned.
+global sym(vp8_recon4b_sse2)
+sym(vp8_recon4b_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;s
+        mov       rdi, arg(2) ;d
+        mov       rdx, arg(1) ;q
+        movsxd    rax, dword ptr arg(3) ;stride
+        pxor      xmm0, xmm0            ; zero, for byte->word unpacking
+
+        ; row 0: low 8 pixels in xmm1, high 8 in xmm5
+        movdqa    xmm1, XMMWORD PTR [rsi]
+        movdqa    xmm5, xmm1
+        punpcklbw xmm1, xmm0
+        punpckhbw xmm5, xmm0
+        paddsw    xmm1, XMMWORD PTR [rdx]
+        paddsw    xmm5, XMMWORD PTR [rdx+16]
+        packuswb  xmm1, xmm5            ; pack saturates both halves to bytes
+        movdqa    XMMWORD PTR [rdi], xmm1
+
+
+        ; row 1
+        movdqa    xmm2, XMMWORD PTR [rsi+16]
+        movdqa    xmm6, xmm2
+        punpcklbw xmm2, xmm0
+        punpckhbw xmm6, xmm0
+        paddsw    xmm2, XMMWORD PTR [rdx+32]
+        paddsw    xmm6, XMMWORD PTR [rdx+48]
+        packuswb  xmm2, xmm6
+        movdqa    XMMWORD PTR [rdi+rax], xmm2
+
+
+        ; row 2
+        movdqa    xmm3, XMMWORD PTR [rsi+32]
+        movdqa    xmm7, xmm3
+        punpcklbw xmm3, xmm0
+        punpckhbw xmm7, xmm0
+        paddsw    xmm3, XMMWORD PTR [rdx+64]
+        paddsw    xmm7, XMMWORD PTR [rdx+80]
+        packuswb  xmm3, xmm7
+        movdqa    XMMWORD PTR [rdi+rax*2], xmm3
+
+        ; row 3: bump rdi by one stride so [rdi+rax*2] addresses d+3*stride
+        add       rdi, rax
+        movdqa    xmm4, XMMWORD PTR [rsi+48]
+        movdqa    xmm5, xmm4
+        punpcklbw xmm4, xmm0
+        punpckhbw xmm5, xmm0
+        paddsw    xmm4, XMMWORD PTR [rdx+96]
+        paddsw    xmm5, XMMWORD PTR [rdx+112]
+        packuswb  xmm4, xmm5
+        movdqa    XMMWORD PTR [rdi+rax*2], xmm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
<br>
+;void copy_mem16x16_sse2(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+; Copies a 16x16 block, one xmm register per row.  Loads use movdqu (src
+; may be unaligned); stores use movdqa, so 'dst' rows must be 16-byte
+; aligned.  Unrolled as five 3-row stanzas (rows 0-14) plus row 15.
+global sym(vp8_copy_mem16x16_sse2)
+sym(vp8_copy_mem16x16_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        ; rows 0-2
+        mov       rsi, arg(0) ;src;
+        movdqu    xmm0, [rsi]
+
+        movsxd    rax, dword ptr arg(1) ;src_stride;
+        mov       rdi, arg(2) ;dst;
+
+        movdqu    xmm1, [rsi+rax]
+        movdqu    xmm2, [rsi+rax*2]
+
+        movsxd    rcx, dword ptr arg(3) ;dst_stride
+        lea       rsi, [rsi+rax*2]
+
+        movdqa    [rdi], xmm0
+        add       rsi, rax              ; rsi -> row 3
+
+        movdqa    [rdi+rcx], xmm1
+        movdqa    [rdi+rcx*2],xmm2
+
+        ; rows 3-5
+        lea       rdi, [rdi+rcx*2]
+        movdqu    xmm3, [rsi]
+
+        add       rdi, rcx              ; rdi -> row 3
+        movdqu    xmm4, [rsi+rax]
+
+        movdqu    xmm5, [rsi+rax*2]
+        lea       rsi, [rsi+rax*2]
+
+        movdqa    [rdi], xmm3
+        add       rsi, rax
+
+        movdqa    [rdi+rcx], xmm4
+        movdqa    [rdi+rcx*2],xmm5
+
+        ; rows 6-8
+        lea       rdi, [rdi+rcx*2]
+        movdqu    xmm0, [rsi]
+
+        add       rdi, rcx
+        movdqu    xmm1, [rsi+rax]
+
+        movdqu    xmm2, [rsi+rax*2]
+        lea       rsi, [rsi+rax*2]
+
+        movdqa    [rdi], xmm0
+        add       rsi, rax
+
+        movdqa    [rdi+rcx], xmm1
+
+        ; rows 9-11 (load of rows 9-10 overlaps the row-8 store)
+        movdqa    [rdi+rcx*2], xmm2
+        movdqu    xmm3, [rsi]
+
+        movdqu    xmm4, [rsi+rax]
+        lea       rdi, [rdi+rcx*2]
+
+        add       rdi, rcx
+        movdqu    xmm5, [rsi+rax*2]
+
+        lea       rsi, [rsi+rax*2]
+        movdqa    [rdi], xmm3
+
+        add       rsi, rax
+        movdqa    [rdi+rcx], xmm4
+
+        ; rows 12-14
+        movdqa    [rdi+rcx*2],xmm5
+        movdqu    xmm0, [rsi]
+
+        lea       rdi, [rdi+rcx*2]
+        movdqu    xmm1, [rsi+rax]
+
+        add       rdi, rcx
+        movdqu    xmm2, [rsi+rax*2]
+
+        lea       rsi, [rsi+rax*2]
+        movdqa    [rdi], xmm0
+
+        movdqa    [rdi+rcx], xmm1
+        movdqa    [rdi+rcx*2],xmm2
+
+        ; row 15
+        movdqu    xmm3, [rsi+rax]
+        lea       rdi, [rdi+rcx*2]
+
+        movdqa    [rdi+rcx], xmm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h
new file mode 100644
index 000000000..c46977842
--- /dev/null
+++ b/vp8/common/x86/recon_x86.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef RECON_X86_H
+#define RECON_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_recon_block(vp8_recon_b_mmx);
+extern prototype_copy_block(vp8_copy_mem8x8_mmx);
+extern prototype_copy_block(vp8_copy_mem8x4_mmx);
+extern prototype_copy_block(vp8_copy_mem16x16_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_recon_recon
+#define vp8_recon_recon vp8_recon_b_mmx
+
+#undef vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx
+
+#undef vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp8_copy_mem8x4_mmx
+
+#undef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp8_copy_mem16x16_mmx
+
+#endif
+#endif
+
+#if HAVE_SSE2
+extern prototype_recon_block(vp8_recon2b_sse2);
+extern prototype_recon_block(vp8_recon4b_sse2);
+extern prototype_copy_block(vp8_copy_mem16x16_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_recon_recon2
+#define vp8_recon_recon2 vp8_recon2b_sse2
+
+#undef vp8_recon_recon4
+#define vp8_recon_recon4 vp8_recon4b_sse2
+
+#undef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
+
+#endif
+#endif
+#endif
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 000000000..c50211813
--- /dev/null
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,817 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+
+;void vp8_filter_block1d_h6_mmx
+;(
+;        unsigned char   *src_ptr,
+;        unsigned short  *output_ptr,
+;        unsigned int    src_pixels_per_line,
+;        unsigned int    pixel_step,
+;        unsigned int    output_height,
+;        unsigned int    output_width,
+;        short           * vp8_filter
+;)
+; First (horizontal) pass of the 6-tap sub-pixel filter: produces 4
+; filtered pixels per row as 16-bit words in output_ptr.  vp8_filter holds
+; 6 taps at 16-byte intervals (tap i at offset 16*i), each tap replicated
+; as 4 words for pmullw.  The pack-then-unpack at the end of the loop
+; clamps the rounded word results to [0,255].
+global sym(vp8_filter_block1d_h6_mmx)
+sym(vp8_filter_block1d_h6_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rdx, arg(6) ;vp8_filter
+
+        movq        mm1, [rdx + 16]             ; do both the negative taps first!!!
+        movq        mm2, [rdx + 32]             ; tap 2
+        movq        mm6, [rdx + 48]             ; tap 3
+        movq        mm7, [rdx + 64]             ; tap 4
+
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rcx, dword ptr arg(4) ;output_height
+        movsxd      rax, dword ptr arg(5) ;output_width      ; destination pitch?
+        pxor        mm0, mm0                    ; mm0 = 00000000
+
+nextrow:
+        movq        mm3, [rsi-2]                ; mm3 = p-2..p5
+        movq        mm4, mm3                    ; mm4 = p-2..p5
+        psrlq       mm3, 8                      ; mm3 = p-1..p5
+        punpcklbw   mm3, mm0                    ; mm3 = p-1..p2
+        pmullw      mm3, mm1                    ; mm3 *= kernel 1 modifiers.
+
+        movq        mm5, mm4                    ; mm5 = p-2..p5
+        punpckhbw   mm4, mm0                    ; mm4 = p2..p5
+        pmullw      mm4, mm7                    ; mm4 *= kernel 4 modifiers
+        paddsw      mm3, mm4                    ; mm3 += mm4
+
+        movq        mm4, mm5                    ; mm4 = p-2..p5;
+        psrlq       mm5, 16                     ; mm5 = p0..p5;
+        punpcklbw   mm5, mm0                    ; mm5 = p0..p3
+        pmullw      mm5, mm2                    ; mm5 *= kernel 2 modifiers
+        paddsw      mm3, mm5                    ; mm3 += mm5
+
+        movq        mm5, mm4                    ; mm5 = p-2..p5
+        psrlq       mm4, 24                     ; mm4 = p1..p5
+        punpcklbw   mm4, mm0                    ; mm4 = p1..p4
+        pmullw      mm4, mm6                    ; mm4 *= kernel 3 modifiers
+        paddsw      mm3, mm4                    ; mm3 += mm4
+
+        ; do outer positive taps
+        movd        mm4, [rsi+3]
+        punpcklbw   mm4, mm0                    ; mm4 = p3..p6
+        pmullw      mm4, [rdx+80]               ; mm4 *= kernel 5 modifiers
+        paddsw      mm3, mm4                    ; mm3 += mm4
+
+        punpcklbw   mm5, mm0                    ; mm5 = p-2..p1
+        pmullw      mm5, [rdx]                  ; mm5 *= kernel 0 modifiers
+        paddsw      mm3, mm5                    ; mm3 += mm5
+
+        paddsw      mm3, [rd GLOBAL]            ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT       ; mm3 /= 128
+        packuswb    mm3, mm0                    ; pack and unpack to saturate
+        punpcklbw   mm3, mm0                    ; back to words for the short output
+
+        movq        [rdi], mm3                  ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+        add         rdi, rax;
+%else
+        movsxd      r8, dword ptr arg(2) ;src_pixels_per_line
+        add         rdi, rax;
+
+        add         rsi, r8                     ; next line
+%endif
+
+        dec         rcx                         ; decrement count
+        jnz         nextrow                     ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;
+; THIS FUNCTION APPEARS TO BE UNUSED
+;
+;void vp8_filter_block1d_v6_mmx
+;(
+;        short   *src_ptr,
+;        unsigned char *output_ptr,
+;        unsigned int pixels_per_line,
+;        unsigned int pixel_step,
+;        unsigned int output_height,
+;        unsigned int output_width,
+;        short * vp8_filter
+;)
+; Second (vertical) pass of the 6-tap filter over 16-bit intermediate data:
+; combines rows -2..+3 around each output row, rounds, shifts by
+; VP8_FILTER_SHIFT and saturates to bytes.  rsi is rewound two rows before
+; the loop so [rsi] is row -2.
+global sym(vp8_filter_block1d_v6_mmx)
+sym(vp8_filter_block1d_v6_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        movq      mm5, [rd GLOBAL]              ; round value (64, for >>7)
+        push        rbx
+        mov         rbx, arg(6) ;vp8_filter
+        movq      mm1, [rbx + 16]               ; do both the negative taps first!!!
+        movq      mm2, [rbx + 32]               ; tap 2
+        movq      mm6, [rbx + 48]               ; tap 3
+        movq      mm7, [rbx + 64]               ; tap 4
+
+        movsxd      rdx, dword ptr arg(2) ;pixels_per_line
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        sub         rsi, rdx                    ; rewind two rows so [rsi] is row -2
+        sub         rsi, rdx
+        movsxd      rcx, DWORD PTR arg(4) ;output_height
+        movsxd      rax, DWORD PTR arg(5) ;output_width      ; destination pitch?
+        pxor        mm0, mm0                    ; mm0 = 00000000
+
+
+nextrow_v:
+        movq      mm3, [rsi+rdx]                ; mm3 = p0..p3 = row -1
+        pmullw    mm3, mm1                      ; mm3 *= kernel 1 modifiers.
+
+
+        movq      mm4, [rsi + 4*rdx]            ; mm4 = p0..p3 = row 2
+        pmullw    mm4, mm7                      ; mm4 *= kernel 4 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+        movq      mm4, [rsi + 2*rdx]            ; mm4 = p0..p3 = row 0
+        pmullw    mm4, mm2                      ; mm4 *= kernel 2 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+        movq      mm4, [rsi]                    ; mm4 = p0..p3 = row -2
+        pmullw    mm4, [rbx]                    ; mm4 *= kernel 0 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+
+        add       rsi, rdx                      ; move source forward 1 line to avoid 3 * pitch
+        movq      mm4, [rsi + 2*rdx]            ; mm4 = p0..p3 = row 1
+        pmullw    mm4, mm6                      ; mm4 *= kernel 3 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+        movq      mm4, [rsi + 4*rdx]            ; mm4 = p0..p3 = row 3
+        pmullw    mm4, [rbx +80]                ; mm4 *= kernel 5 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+
+        paddsw    mm3, mm5                      ; mm3 += round value
+        psraw     mm3, VP8_FILTER_SHIFT         ; mm3 /= 128
+        packuswb  mm3, mm0                      ; pack and saturate
+
+        movd      [rdi],mm3                     ; store the results in the destination
+
+        add       rdi,rax;
+
+        dec       rcx                           ; decrement count
+        jnz       nextrow_v                     ; next row
+
+        pop rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;(
+;        short *src_ptr,
+;        unsigned char *output_ptr,
+;        int output_pitch,
+;        unsigned int pixels_per_line,
+;        unsigned int pixel_step,
+;        unsigned int output_height,
+;        unsigned int output_width,
+;        short * vp8_filter
+;)
+; Same vertical 6-tap pass as vp8_filter_block1d_v6_mmx, but with an
+; explicit output_pitch argument (arg 2) for the destination stride.
+global sym(vp8_filter_block1dc_v6_mmx)
+sym(vp8_filter_block1dc_v6_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        movq      mm5, [rd GLOBAL]              ; round value (64, for >>7)
+        push        rbx
+        mov         rbx, arg(7) ;vp8_filter
+        movq      mm1, [rbx + 16]               ; do both the negative taps first!!!
+        movq      mm2, [rbx + 32]               ; tap 2
+        movq      mm6, [rbx + 48]               ; tap 3
+        movq      mm7, [rbx + 64]               ; tap 4
+
+        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        sub         rsi, rdx                    ; rewind two rows so [rsi] is row -2
+        sub         rsi, rdx
+        movsxd      rcx, DWORD PTR arg(5) ;output_height
+        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
+        pxor        mm0, mm0                    ; mm0 = 00000000
+
+
+nextrow_cv:
+        movq      mm3, [rsi+rdx]                ; mm3 = p0..p3 = row -1
+        pmullw    mm3, mm1                      ; mm3 *= kernel 1 modifiers.
+
+
+        movq      mm4, [rsi + 4*rdx]            ; mm4 = p0..p3 = row 2
+        pmullw    mm4, mm7                      ; mm4 *= kernel 4 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+        movq      mm4, [rsi + 2*rdx]            ; mm4 = p0..p3 = row 0
+        pmullw    mm4, mm2                      ; mm4 *= kernel 2 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+        movq      mm4, [rsi]                    ; mm4 = p0..p3 = row -2
+        pmullw    mm4, [rbx]                    ; mm4 *= kernel 0 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+
+        add       rsi, rdx                      ; move source forward 1 line to avoid 3 * pitch
+        movq      mm4, [rsi + 2*rdx]            ; mm4 = p0..p3 = row 1
+        pmullw    mm4, mm6                      ; mm4 *= kernel 3 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+        movq      mm4, [rsi + 4*rdx]            ; mm4 = p0..p3 = row 3
+        pmullw    mm4, [rbx +80]                ; mm4 *= kernel 5 modifiers.
+        paddsw    mm3, mm4                      ; mm3 += mm4
+
+
+        paddsw    mm3, mm5                      ; mm3 += round value
+        psraw     mm3, VP8_FILTER_SHIFT         ; mm3 /= 128
+        packuswb  mm3, mm0                      ; pack and saturate
+
+        movd      [rdi],mm3                     ; store the results in the destination
+        ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
+        ; recon block should be in cache this shouldn't cost much. Its obviously
+        ; avoidable!!!.
+        lea       rdi,  [rdi+rax] ;
+        dec       rcx                           ; decrement count
+        jnz       nextrow_cv                    ; next row
+
+        pop rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void bilinear_predict8x8_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+; Two-pass bilinear prediction of an 8x8 block.  xoffset/yoffset select a
+; 2-tap filter pair (32 bytes per entry) from vp8_bilinear_filters_mmx.
+; Each iteration horizontally filters the current source row (mm3/mm4),
+; then vertically blends it with the previous row's horizontal result,
+; which is carried between iterations packed in mm7.  rcx holds
+; dst + 8*dst_pitch and serves as the loop-termination sentinel.
+global sym(vp8_bilinear_predict8x8_mmx)
+sym(vp8_bilinear_predict8x8_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax, dword ptr arg(2) ;xoffset
+        mov         rdi, arg(4) ;dst_ptr           ;
+
+        shl         rax, 5 ; offset * 32
+        lea         rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+
+        add         rax, rcx ; HFilter
+        mov         rsi, arg(0) ;src_ptr              ;
+
+        movsxd      rdx, dword ptr arg(5) ;dst_pitch
+        movq        mm1, [rax]               ; HFilter tap 0
+
+        movq        mm2, [rax+16]            ; HFilter tap 1
+        movsxd      rax, dword ptr arg(3) ;yoffset
+
+        pxor        mm0, mm0                 ; zero, for unpacking
+
+        shl         rax, 5 ; offset*32
+        add         rax, rcx ; VFilter
+
+        lea         rcx, [rdi+rdx*8]         ; end-of-destination sentinel
+        movsxd      rdx, dword ptr arg(1)    ;src_pixels_per_line    ;
+
+
+
+        ; get the first horizontal line done       ;
+        movq        mm3, [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4, mm3                 ; make a copy of current line
+
+        punpcklbw   mm3, mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4, mm0                 ;
+
+        pmullw      mm3, mm1                 ;
+        pmullw      mm4, mm1                 ;
+
+        movq        mm5, [rsi+1]             ;
+        movq        mm6, mm5                 ;
+
+        punpcklbw   mm5, mm0                 ;
+        punpckhbw   mm6, mm0                 ;
+
+        pmullw      mm5, mm2                 ;
+        pmullw      mm6, mm2                 ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm4, mm6                 ;
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        paddw       mm4, [rd GLOBAL]         ;
+        psraw       mm4, VP8_FILTER_SHIFT    ;
+
+        movq        mm7, mm3                 ; carry horizontal result to next iteration
+        packuswb    mm7, mm4                 ;
+
+        add         rsi, rdx                 ; next line
+next_row_8x8:
+        ; horizontal filter of the current line into mm3 (low) / mm4 (high)
+        movq        mm3, [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4, mm3                 ; make a copy of current line
+
+        punpcklbw   mm3, mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4, mm0                 ;
+
+        pmullw      mm3, mm1                 ;
+        pmullw      mm4, mm1                 ;
+
+        movq        mm5, [rsi+1]             ;
+        movq        mm6, mm5                 ;
+
+        punpcklbw   mm5, mm0                 ;
+        punpckhbw   mm6, mm0                 ;
+
+        pmullw      mm5, mm2                 ;
+        pmullw      mm6, mm2                 ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm4, mm6                 ;
+
+        ; unpack the previous line's horizontal result and apply VFilter tap 0
+        movq        mm5, mm7                 ;
+        movq        mm6, mm7                 ;
+
+        punpcklbw   mm5, mm0                 ;
+        punpckhbw   mm6, mm0
+
+        pmullw      mm5, [rax]               ;
+        pmullw      mm6, [rax]               ;
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        paddw       mm4, [rd GLOBAL]         ;
+        psraw       mm4, VP8_FILTER_SHIFT    ;
+
+        movq        mm7, mm3                 ; save current line for the next iteration
+        packuswb    mm7, mm4                 ;
+
+
+        ; vertical blend: current * VFilter tap 1 + previous * VFilter tap 0
+        pmullw      mm3, [rax+16]            ;
+        pmullw      mm4, [rax+16]            ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm4, mm6                 ;
+
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        paddw       mm4, [rd GLOBAL]         ;
+        psraw       mm4, VP8_FILTER_SHIFT    ;
+
+        packuswb    mm3, mm4
+
+        movq        [rdi], mm3               ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi, rdx                 ; next line
+        add         rdi, dword ptr arg(5)    ;dst_pitch   ;
+%else
+        movsxd      r8, dword ptr arg(5)     ;dst_pitch
+        add         rsi, rdx                 ; next line
+        add         rdi, r8                  ;dst_pitch
+%endif
+        cmp         rdi, rcx                 ;
+        jne         next_row_8x8
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void bilinear_predict8x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+; 8x4 variant of vp8_bilinear_predict8x8_mmx: identical two-pass scheme
+; (horizontal filter, then vertical blend with the previous row carried in
+; mm7), with the loop sentinel at dst + 4*dst_pitch.
+global sym(vp8_bilinear_predict8x4_mmx)
+sym(vp8_bilinear_predict8x4_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax, dword ptr arg(2) ;xoffset
+        mov         rdi, arg(4) ;dst_ptr           ;
+
+        lea         rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+        shl         rax, 5                   ; filter entries are 32 bytes apart
+
+        mov         rsi, arg(0) ;src_ptr              ;
+        add         rax, rcx                 ; rax -> HFilter
+
+        movsxd      rdx, dword ptr arg(5) ;dst_pitch
+        movq        mm1, [rax]               ; HFilter tap 0
+
+        movq        mm2, [rax+16]            ; HFilter tap 1
+        movsxd      rax, dword ptr arg(3) ;yoffset
+
+        pxor        mm0, mm0                 ; zero, for unpacking
+        shl         rax, 5
+
+        add         rax, rcx                 ; rax -> VFilter
+        lea         rcx, [rdi+rdx*4]         ; end-of-destination sentinel
+
+        movsxd      rdx, dword ptr arg(1)    ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movq        mm3, [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4, mm3                 ; make a copy of current line
+
+        punpcklbw   mm3, mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4, mm0                 ;
+
+        pmullw      mm3, mm1                 ;
+        pmullw      mm4, mm1                 ;
+
+        movq        mm5, [rsi+1]             ;
+        movq        mm6, mm5                 ;
+
+        punpcklbw   mm5, mm0                 ;
+        punpckhbw   mm6, mm0                 ;
+
+        pmullw      mm5, mm2                 ;
+        pmullw      mm6, mm2                 ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm4, mm6                 ;
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        paddw       mm4, [rd GLOBAL]         ;
+        psraw       mm4, VP8_FILTER_SHIFT    ;
+
+        movq        mm7, mm3                 ; carry horizontal result to next iteration
+        packuswb    mm7, mm4                 ;
+
+        add         rsi, rdx                 ; next line
+next_row_8x4:
+        ; horizontal filter of the current line into mm3 (low) / mm4 (high)
+        movq        mm3, [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4, mm3                 ; make a copy of current line
+
+        punpcklbw   mm3, mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4, mm0                 ;
+
+        pmullw      mm3, mm1                 ;
+        pmullw      mm4, mm1                 ;
+
+        movq        mm5, [rsi+1]             ;
+        movq        mm6, mm5                 ;
+
+        punpcklbw   mm5, mm0                 ;
+        punpckhbw   mm6, mm0                 ;
+
+        pmullw      mm5, mm2                 ;
+        pmullw      mm6, mm2                 ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm4, mm6                 ;
+
+        ; unpack the previous line's horizontal result and apply VFilter tap 0
+        movq        mm5, mm7                 ;
+        movq        mm6, mm7                 ;
+
+        punpcklbw   mm5, mm0                 ;
+        punpckhbw   mm6, mm0
+
+        pmullw      mm5, [rax]               ;
+        pmullw      mm6, [rax]               ;
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        paddw       mm4, [rd GLOBAL]         ;
+        psraw       mm4, VP8_FILTER_SHIFT    ;
+
+        movq        mm7, mm3                 ; save current line for the next iteration
+        packuswb    mm7, mm4                 ;
+
+
+        ; vertical blend: current * VFilter tap 1 + previous * VFilter tap 0
+        pmullw      mm3, [rax+16]            ;
+        pmullw      mm4, [rax+16]            ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm4, mm6                 ;
+
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        paddw       mm4, [rd GLOBAL]         ;
+        psraw       mm4, VP8_FILTER_SHIFT    ;
+
+        packuswb    mm3, mm4
+
+        movq        [rdi], mm3               ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi, rdx                 ; next line
+        add         rdi, dword ptr arg(5)    ;dst_pitch   ;
+%else
+        movsxd      r8, dword ptr arg(5)     ;dst_pitch
+        add         rsi, rdx                 ; next line
+        add         rdi, r8
+%endif
+        cmp         rdi, rcx                 ;
+        jne         next_row_8x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;void bilinear_predict4x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+; 4x4 variant of the two-pass bilinear predictor: only the low 4 pixels per
+; row are processed (movd loads/stores), the previous row's horizontal
+; result is carried in mm7, and the sentinel is dst + 4*dst_pitch.
+global sym(vp8_bilinear_predict4x4_mmx)
+sym(vp8_bilinear_predict4x4_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax, dword ptr arg(2) ;xoffset
+        mov         rdi, arg(4) ;dst_ptr           ;
+
+        lea         rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+        shl         rax, 5                   ; filter entries are 32 bytes apart
+
+        add         rax, rcx ; HFilter
+        mov         rsi, arg(0) ;src_ptr              ;
+
+        movsxd      rdx, dword ptr arg(5) ;dst_pitch
+        movq        mm1, [rax]               ; HFilter tap 0
+
+        movq        mm2, [rax+16]            ; HFilter tap 1
+        movsxd      rax, dword ptr arg(3) ;yoffset
+
+        pxor        mm0, mm0                 ; zero, for unpacking
+        shl         rax, 5
+
+        add         rax, rcx                 ; rax -> VFilter
+        lea         rcx, [rdi+rdx*4]         ; end-of-destination sentinel
+
+        movsxd      rdx, dword ptr arg(1)    ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movd        mm3, [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        punpcklbw   mm3, mm0                 ; xx 00 01 02 03 04 05 06
+
+        pmullw      mm3, mm1                 ;
+        movd        mm5, [rsi+1]             ;
+
+        punpcklbw   mm5, mm0                 ;
+        pmullw      mm5, mm2                 ;
+
+        paddw       mm3, mm5                 ;
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        movq        mm7, mm3                 ; carry horizontal result to next iteration
+        packuswb    mm7, mm0                 ;
+
+        add         rsi, rdx                 ; next line
+next_row_4x4:
+        ; horizontal filter of the current line into mm3
+        movd        mm3, [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        punpcklbw   mm3, mm0                 ; xx 00 01 02 03 04 05 06
+
+        pmullw      mm3, mm1                 ;
+        movd        mm5, [rsi+1]             ;
+
+        punpcklbw   mm5, mm0                 ;
+        pmullw      mm5, mm2                 ;
+
+        paddw       mm3, mm5                 ;
+
+        ; previous line's horizontal result * VFilter tap 0
+        movq        mm5, mm7                 ;
+        punpcklbw   mm5, mm0                 ;
+
+        pmullw      mm5, [rax]               ;
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+        movq        mm7, mm3                 ; save current line for the next iteration
+
+        packuswb    mm7, mm0                 ;
+
+        ; vertical blend: current * VFilter tap 1 + previous * VFilter tap 0
+        pmullw      mm3, [rax+16]            ;
+        paddw       mm3, mm5                 ;
+
+
+        paddw       mm3, [rd GLOBAL]         ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT    ; mm3 /= 128
+
+        packuswb    mm3, mm0
+        movd        [rdi], mm3               ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi, rdx                 ; next line
+        add         rdi, dword ptr arg(5)    ;dst_pitch   ;
+%else
+        movsxd      r8, dword ptr arg(5)     ;dst_pitch  ;
+        add         rsi, rdx                 ; next line
+        add         rdi, r8
+%endif
+
+        cmp         rdi, rcx                 ;
+        jne         next_row_4x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 4 dw 0x40              ; rounding constant: 4 words of 64 (added before >>VP8_FILTER_SHIFT)
+
+align 16
+; Eight 6-tap filter kernels, one per 1/8-pel position.  Each tap value is
+; replicated into 8 words so a whole row can be multiplied with one pmullw;
+; taps are therefore 16 bytes apart within a kernel, kernels 96 bytes apart.
+; Each kernel's taps sum to 128 (the filter weight removed by >>7).
+global sym(vp8_six_tap_mmx)
+sym(vp8_six_tap_mmx):
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 128
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 0
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 123
+    times 8 dw 12
+    times 8 dw -1
+    times 8 dw 0
+
+    times 8 dw 2
+    times 8 dw -11
+    times 8 dw 108
+    times 8 dw 36
+    times 8 dw -8
+    times 8 dw 1
+
+    times 8 dw 0
+    times 8 dw -9
+    times 8 dw 93
+    times 8 dw 50
+    times 8 dw -6
+    times 8 dw 0
+
+    times 8 dw 3
+    times 8 dw -16
+    times 8 dw 77
+    times 8 dw 77
+    times 8 dw -16
+    times 8 dw 3
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 50
+    times 8 dw 93
+    times 8 dw -9
+    times 8 dw 0
+
+    times 8 dw 1
+    times 8 dw -8
+    times 8 dw 36
+    times 8 dw 108
+    times 8 dw -11
+    times 8 dw 2
+
+    times 8 dw 0
+    times 8 dw -1
+    times 8 dw 12
+    times 8 dw 123
+    times 8 dw -6
+    times 8 dw 0
+
+
+align 16
+; Eight 2-tap bilinear kernels, one per 1/8-pel position (entries 32 bytes
+; apart - hence the 'shl rax, 5' in the predict routines).  Tap pairs sum
+; to 128.
+global sym(vp8_bilinear_filters_mmx)
+sym(vp8_bilinear_filters_mmx):
+    times 8 dw 128
+    times 8 dw 0
+
+    times 8 dw 112
+    times 8 dw 16
+
+    times 8 dw 96
+    times 8 dw 32
+
+    times 8 dw 80
+    times 8 dw 48
+
+    times 8 dw 64
+    times 8 dw 64
+
+    times 8 dw 48
+    times 8 dw 80
+
+    times 8 dw 32
+    times 8 dw 96
+
+    times 8 dw 16
+    times 8 dw 112
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 000000000..dee04f2d9
--- /dev/null
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,1032 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d8_h6 applies a 6-tap filter horizontally to the input pixels.
+; The input pixel array has output_height rows. This routine assumes that
+; output_height is an even number. This function handles 8 pixels in the horizontal
+; direction, calculating one row per iteration to take advantage of 128-bit operations.
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+global sym(vp8_filter_block1d8_h6_sse2)
+sym(vp8_filter_block1d8_h6_sse2): ; 6-tap horizontal filter; 8 output pixels per row, stored as unpacked 16-bit words for a later vertical pass
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; load 8 bytes starting 2 pixels left of the output position
+ movq xmm1, MMWORD PTR [rsi + 6] ; load the next 8 bytes (right-hand filter context)
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8 ; merge both 8-byte halves into one 16-byte register
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7 ; sum the six tap products with signed saturation
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [rd GLOBAL] ; += rounding constant (64)
+
+ psraw xmm4, 7 ; >> VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm0 ; clamp to [0,255] ...
+ punpcklbw xmm4, xmm0 ; ... then re-widen to 16-bit words for the vertical pass
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_h6 applies a 6-tap filter horizontally to the input pixels.
+; The input pixel array has output_height rows. This routine assumes that
+; output_height is an even number. This function handles 16 pixels per row as two
+; 8-pixel groups, calculating one row per iteration using 128-bit operations.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2)
+sym(vp8_filter_block1d16_h6_sse2): ; 6-tap horizontal filter; 16 output pixels per row processed as two 8-pixel groups, stored as unpacked 16-bit words
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; left 8 bytes, starting 2 pixels before the first output
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14] ; bytes 14..21: input for the second 8-pixel group
+ pslldq xmm2, 8
+
+ por xmm2, xmm1 ; xmm2 = input window for pixels 8..15
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3 ; xmm1 = input window for pixels 0..7
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7 ; sum the six tap products with signed saturation
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [rd GLOBAL] ; += rounding constant (64)
+
+ psraw xmm4, 7 ; >> VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm0 ; clamp to [0,255] then re-widen to 16-bit
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4 ; store pixels 0..7
+
+ movdqa xmm3, xmm2 ; now filter the second 8-pixel group of the row
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7 ; sum, round and shift exactly as for the first group
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [rd GLOBAL]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4 ; store pixels 8..15
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d8_v6_sse2)
+sym(vp8_filter_block1d8_v6_sse2): ; 6-tap vertical filter over 16-bit intermediate rows; packs each output row back to 8-bit
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx ; back up two rows: the taps span rows -2 .. +3
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [rd GLOBAL] ; rounding constant (64)
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_ptich
+%endif
+
+vp8_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi] ; row -2 * Tap 1
+ pmullw xmm1, [rax]
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx] ; row -1 * Tap 2
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; row 0 * Tap 3
+ pmullw xmm3, [rax + 32]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; row +2 * Tap 5
+ pmullw xmm5, [rax + 64]
+
+ add rsi, rdx ; advance one row; also brings rows +1 and +3 into addressable range
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] ; row +1 * Tap 4
+
+ pmullw xmm4, [rax + 48]
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] ; row +3 * Tap 6
+
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5 ; sum the six tap products with signed saturation
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7 ; += rounding constant
+
+ psraw xmm2, 7 ; >> VP8_FILTER_SHIFT
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_ptich]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz vp8_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
+global sym(vp8_unpack_block1d16_h6_sse2)
+sym(vp8_unpack_block1d16_h6_sse2): ; no-filter path: copies rows of 16 bytes, widening each pixel to a 16-bit word for the vertical pass
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
+%endif
+
+unpack_block1d16_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; first 8 source pixels of the row
+ movq xmm3, MMWORD PTR [rsi+8] ; second 8 source pixels of the row
+
+ punpcklbw xmm3, xmm0 ; widen bytes to words (zero-extend)
+ punpcklbw xmm1, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm1
+ movdqa XMMWORD Ptr [rdi + 16], xmm3
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
+global sym(vp8_unpack_block1d8_h6_sse2)
+sym(vp8_unpack_block1d8_h6_sse2): ; no-filter path: copies rows of 8 bytes, widening each pixel to a 16-bit word for the vertical pass
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
+%endif
+
+unpack_block1d8_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; 8 source pixels of the row
+ lea rsi, [rsi + rax]
+
+ punpcklbw xmm1, xmm0 ; widen bytes to words (zero-extend)
+ movdqa XMMWORD Ptr [rdi], xmm1
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz unpack_block1d8_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_pack_block1d8_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
+global sym(vp8_pack_block1d8_v6_sse2)
+sym(vp8_pack_block1d8_v6_sse2): ; no-filter path: packs rows of 8 16-bit words back to 8 bytes with unsigned saturation
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width ; Pitch for Source
+%endif
+
+pack_block1d8_v6_sse2_loop:
+ movdqa xmm0, XMMWORD PTR [rsi] ; 8 intermediate words
+ packuswb xmm0, xmm0 ; saturate to [0,255] bytes
+
+ movq QWORD PTR [rdi], xmm0 ; store the results in the destination
+ lea rsi, [rsi+rdx]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz pack_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_pack_block1d16_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
+global sym(vp8_pack_block1d16_v6_sse2)
+sym(vp8_pack_block1d16_v6_sse2): ; no-filter path: packs rows of 16 16-bit words back to 16 bytes with unsigned saturation
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ;dst_pitch
+%endif
+
+pack_block1d16_v6_sse2_loop:
+ movdqa xmm0, XMMWORD PTR [rsi] ; first 8 intermediate words
+ movdqa xmm1, XMMWORD PTR [rsi+16] ; second 8 intermediate words
+
+ packuswb xmm0, xmm1 ; saturate all 16 values to bytes
+ movdqa XMMWORD PTR [rdi], xmm0 ; store the results in the destination
+
+ add rsi, rdx
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(2) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz pack_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+extern sym(vp8_bilinear_filters_mmx)
+global sym(vp8_bilinear_predict16x16_sse2)
+sym(vp8_bilinear_predict16x16_sse2): ; 16x16 bilinear predictor: horizontal first pass + vertical second pass; either pass is skipped when its offset is 0
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset]
+ ;const short *VFilter = bilinear_filters_mmx[yoffset]
+
+ lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+ movsxd rax, dword ptr arg(2) ;xoffset
+
+ cmp rax, 0 ;skip first_pass filter if xoffset=0
+ je b16x16_sp_only
+
+ shl rax, 5 ; each filter pair is 32 bytes
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax] ; horizontal weight for pixel x
+ movdqa xmm2, [rax+16] ; horizontal weight for pixel x+1
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ cmp rax, 0 ;skip second_pass filter if yoffset=0
+ je b16x16_fp_only
+
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst + 16*dst_pitch: loop terminator
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+%endif
+ ; get the first horizontal line done
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1] ; pixels shifted one to the right
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [rd GLOBAL]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3 ; xmm7 keeps the previous filtered row (packed) for the vertical tap
+ packuswb xmm7, xmm4
+
+ add rsi, rdx ; next line
+next_row:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ movdqa xmm5, xmm7 ; unpack the saved previous row for the vertical filter
+ movdqa xmm6, xmm7
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, [rax] ; previous row * first vertical weight
+ pmullw xmm6, [rax]
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [rd GLOBAL]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3 ; save this row (packed) for the next iteration
+ packuswb xmm7, xmm4
+
+ pmullw xmm3, [rax+16] ; current row * second vertical weight
+ pmullw xmm4, [rax+16]
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [rd GLOBAL]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rdx ; next line
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne next_row
+
+ jmp done
+
+b16x16_sp_only: ; xoffset==0: vertical (second-pass) filter only
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax] ; vertical weight for row y
+ movdqa xmm2, [rax+16] ; vertical weight for row y+1
+
+ lea rcx, [rdi+rdx*8] ; dst end = dst + 16*dst_pitch
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ add rsi, rax ; next line
+next_row_spo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ movdqa xmm5, xmm7 ; previous source row
+ movdqa xmm6, xmm7
+
+ movdqa xmm4, xmm3 ; make a copy of current line
+ movdqa xmm7, xmm3 ; current row becomes next iteration's previous row
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm5, xmm1
+ pmullw xmm6, xmm1
+ pmullw xmm3, xmm2
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [rd GLOBAL]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ;dst_pitch
+ cmp rdi, rcx
+ jne next_row_spo
+
+ jmp done
+
+b16x16_fp_only: ; yoffset==0: horizontal (first-pass) filter only
+ lea rcx, [rdi+rdx*8] ; dst end = dst + 16*dst_pitch
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+next_row_fpo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [rd GLOBAL]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ; dst_pitch
+ cmp rdi, rcx
+ jne next_row_fpo
+
+done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+extern sym(vp8_bilinear_filters_mmx)
+global sym(vp8_bilinear_predict8x8_sse2)
+sym(vp8_bilinear_predict8x8_sse2): ; 8x8 bilinear predictor; stages the 9 (unaligned) source rows on the stack, then filters from there
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes ; 9 rows * 16 bytes
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset]
+ ;const short *VFilter = bilinear_filters_mmx[yoffset]
+ lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9-line unaligned data in and put them on stack. This gives a big
+ ;performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2] ; rax = 3 * src_pitch
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2] ; 9th row
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ shl rax, 5 ; each filter pair is 32 bytes
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax] ; horizontal weights
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst + 8*dst_pitch: loop terminator
+
+ movdqa xmm5, [rax] ; vertical weights
+ movdqa xmm6, [rax+16]
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqa xmm3, XMMWORD PTR [rsp]
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3 ; xmm7 = previous filtered row for the vertical tap
+ add rsp, 16 ; next line ; walk rsp through the staged rows
+next_row8x8:
+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+ pmullw xmm7, xmm5 ; previous row * first vertical weight
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm4, xmm3
+
+ pmullw xmm3, xmm6 ; current row * second vertical weight
+ paddw xmm3, xmm7
+
+ movdqa xmm7, xmm4 ; save current row for next iteration
+
+ paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb xmm3, xmm0
+ movq [rdi], xmm3 ; store the results in the destination
+
+ add rsp, 16 ; next line
+ add rdi, rdx
+
+ cmp rdi, rcx
+ jne next_row8x8
+
+ ;add rsp, 144
+ pop rsp ; NOTE(review): appears to restore the rsp value pushed by ALIGN_STACK; the loop advanced rsp exactly 144 bytes to reach it — confirm against ALIGN_STACK's definition
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd: ; rounding constant: 8 words of 0x40 (64), added before the arithmetic shift right by VP8_FILTER_SHIFT (7)
+ times 8 dw 0x40
diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h
new file mode 100644
index 000000000..efa7b2e09
--- /dev/null
+++ b/vp8/common/x86/subpixel_x86.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_X86_H
+#define SUBPIXEL_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX /* prototypes for the MMX subpel predictors */
+extern prototype_subpixel_predict(vp8_sixtap_predict16x16_mmx);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x8_mmx);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x4_mmx);
+extern prototype_subpixel_predict(vp8_sixtap_predict4x4_mmx);
+extern prototype_subpixel_predict(vp8_bilinear_predict16x16_mmx);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x4_mmx);
+extern prototype_subpixel_predict(vp8_bilinear_predict4x4_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT /* static build: map the generic names directly to the MMX versions */
+#undef vp8_subpix_sixtap16x16
+#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_mmx
+
+#undef vp8_subpix_sixtap8x8
+#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_mmx
+
+#undef vp8_subpix_sixtap8x4
+#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_mmx
+
+#undef vp8_subpix_sixtap4x4
+#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_mmx
+
+#undef vp8_subpix_bilinear16x16
+#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_mmx
+
+#undef vp8_subpix_bilinear8x8
+#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_mmx
+
+#undef vp8_subpix_bilinear8x4
+#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_mmx
+
+#undef vp8_subpix_bilinear4x4
+#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2 /* prototypes for the SSE2 subpel predictors; SSE2 mappings override the MMX ones above where available */
+extern prototype_subpixel_predict(vp8_sixtap_predict16x16_sse2);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x8_sse2);
+extern prototype_subpixel_predict(vp8_sixtap_predict8x4_sse2);
+extern prototype_subpixel_predict(vp8_bilinear_predict16x16_sse2);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_subpix_sixtap16x16
+#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_sse2
+
+#undef vp8_subpix_sixtap8x8
+#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_sse2
+
+#undef vp8_subpix_sixtap8x4
+#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_sse2
+
+#undef vp8_subpix_bilinear16x16
+#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_sse2
+
+#undef vp8_subpix_bilinear8x8
+#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_sse2
+
+#endif
+#endif
+
+#endif
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 000000000..68454f709
--- /dev/null
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "subpixel.h"
+
+extern const short vp8_six_tap_mmx[8][6*8];
+extern const short vp8_bilinear_filters_mmx[8][2*8];
+
+extern void vp8_filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1dc_v6_mmx
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_unpack_block1d8_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_pack_block1d8_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_pack_block1d16_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);
+
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16);  // Temp data buffer used in filtering
+    const short *HFilter, *VFilter;
+    HFilter = vp8_six_tap_mmx[xoffset];
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);    // horizontal pass: starts 2 rows above, 9 rows (4 output + 5 context), row stride 8
+    VFilter = vp8_six_tap_mmx[yoffset];
+    vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);  // vertical pass over the intermediate buffer, offset one row (8 words) in
+
+}
+
+
+void vp8_sixtap_predict16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  // Temp data buffer used in filtering
+
+    const short *HFilter, *VFilter;
+
+
+    HFilter = vp8_six_tap_mmx[xoffset];
+
+    // horizontal pass in four 4-pixel-wide strips: 21 rows (16 output + 5 context), intermediate row stride 32
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),      FData2,      src_pixels_per_line, 1, 21, 32, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4,  src_pixels_per_line, 1, 21, 32, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,  FData2 + 8,  src_pixels_per_line, 1, 21, 32, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
+
+    VFilter = vp8_six_tap_mmx[yoffset];
+    // vertical pass per strip, offset one intermediate row (32 words) in
+    vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr,      dst_pitch, 32, 16 , 16, 16, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4,  dst_pitch, 32, 16 , 16, 16, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8,  dst_pitch, 32, 16 , 16, 16, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x8_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    // Temp data buffer used in filtering
+
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_six_tap_mmx[xoffset];
+    // horizontal pass in two 4-pixel-wide strips: 13 rows (8 output + 5 context), intermediate row stride 16
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),     FData2,     src_pixels_per_line, 1, 13, 16, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
+
+    VFilter = vp8_six_tap_mmx[yoffset];
+    // vertical pass per strip, offset one intermediate row (16 words) in
+    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,     dst_pitch, 16, 8 , 8, 8, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    // Temp data buffer used in filtering
+
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_six_tap_mmx[xoffset];
+    // horizontal pass in two 4-pixel-wide strips: 9 rows (4 output + 5 context), intermediate row stride 16
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),     FData2,     src_pixels_per_line, 1, 9, 16, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
+
+    VFilter = vp8_six_tap_mmx[yoffset];
+    // vertical pass per strip: 4 output rows, offset one intermediate row (16 words) in
+    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,     dst_pitch, 16, 8 , 4, 8, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); // Temp data bufffer used in filtering
+
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ }
+ else
+ {
+ vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
+ }
+
+ if (yoffset)
+ {
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
+ vp8_filter_block1d8_v6_sse2(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
+ }
+ else
+ {
+ vp8_pack_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16);
+ }
+}
+
+
+void vp8_sixtap_predict8x8_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ }
+ else
+ {
+ vp8_unpack_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 13, 16);
+ }
+
+ if (yoffset)
+ {
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ vp8_pack_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, dst_pitch);
+ }
+
+
+}
+
+
+void vp8_sixtap_predict8x4_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ }
+ else
+ {
+ vp8_unpack_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 9, 16);
+ }
+
+ if (yoffset)
+ {
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ vp8_pack_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 4, dst_pitch);
+ }
+
+
+}
+#endif
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
new file mode 100644
index 000000000..5312e06da
--- /dev/null
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "g_common.h"
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "pragmas.h"
+#include "onyxc_int.h"
+
+void vp8_arch_x86_common_init(VP8_COMMON *ctx)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
+ int flags = x86_simd_caps();
+ int mmx_enabled = flags & HAS_MMX;
+ int xmm_enabled = flags & HAS_SSE;
+ int wmt_enabled = flags & HAS_SSE2;
+
+ /* Note:
+ *
+ * This platform can be built without runtime CPU detection as well. If
+ * you modify any of the function mappings present in this file, be sure
+ * to also update them in static mapings (<arch>/filename_<arch>.h)
+ */
+
+ /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+
+ if (mmx_enabled)
+ {
+ rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx;
+ rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
+ rtcd->idct.idct1_scalar = vp8_dc_only_idct_mmx;
+ rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
+ rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
+
+
+
+ rtcd->recon.recon = vp8_recon_b_mmx;
+ rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
+ rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
+ rtcd->recon.copy16x16 = vp8_copy_mem16x16_mmx;
+
+ rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_mmx;
+ rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_mmx;
+ rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_mmx;
+ rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_mmx;
+ rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_mmx;
+ rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_mmx;
+ rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_mmx;
+ rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_mmx;
+
+ rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_mmx;
+ rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_mmx;
+ rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx;
+ rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_mmx;
+ rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx;
+ rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_mmx;
+ rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx;
+ rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_mmx;
+
+#if CONFIG_POSTPROC
+ rtcd->postproc.down = vp8_mbpost_proc_down_mmx;
+ //rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
+ rtcd->postproc.downacross = vp8_post_proc_down_and_across_mmx;
+ rtcd->postproc.addnoise = vp8_plane_add_noise_mmx;
+#endif
+ }
+
+#endif
+#if HAVE_SSE2
+
+ if (wmt_enabled)
+ {
+ rtcd->recon.recon2 = vp8_recon2b_sse2;
+ rtcd->recon.recon4 = vp8_recon4b_sse2;
+ rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;
+
+ rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2;
+
+ rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_sse2;
+ rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_sse2;
+ rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_sse2;
+ rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_sse2;
+ rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_sse2;
+
+ rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_sse2;
+ rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_sse2;
+ rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2;
+ rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_sse2;
+ rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2;
+ rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_sse2;
+ rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2;
+ rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_sse2;
+
+#if CONFIG_POSTPROC
+ rtcd->postproc.down = vp8_mbpost_proc_down_xmm;
+ rtcd->postproc.across = vp8_mbpost_proc_across_ip_xmm;
+ rtcd->postproc.downacross = vp8_post_proc_down_and_across_xmm;
+ rtcd->postproc.addnoise = vp8_plane_add_noise_wmt;
+#endif
+ }
+
+#endif
+#endif
+}