Merge branch 'experimental' of ssh://on2-git.corp.google.com:29418/libvpx into test

Conflicts: vp8/common/blockd.h vp8/decoder/decodemv.c vp8/decoder/decodframe.c vp8/decoder/demode.c vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/encoder/encodeframe.c Change-Id: Ic379f4dffaded9796dc19d56be304d3f8527c61f
author: Suman Sunkara <sunkaras@google.com> 2010-11-16 15:09:26 -0500
committer: Suman Sunkara <sunkaras@google.com> 2010-11-16 16:30:59 -0500
commit: 4b3f72001de952aaf3874d3ce2dca3cb3fbe3928 (patch)
tree: 754d87a4fa5cae908388f0a26b2affad7cd8a523 /vp8/decoder
parent: b9a18344cf8e5283928525c5ac0897ede79f9e57 (diff)
parent: 00fe7441e9c5cbd898a8382d49cf9396d8482464 (diff)
download: libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.tar
libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.tar.gz
libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.tar.bz2
libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.zip
55 files changed, 5403 insertions, 3924 deletions
diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c
new file mode 100644
index 000000000..e9741e286
--- /dev/null
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "blockd.h"
+#include "pragmas.h"
+#include "postproc.h"
+#include "dboolhuff.h"
+#include "dequantize.h"
+#include "onyxd_int.h"
+
+void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    int flags = pbi->common.rtcd.flags;
+    int has_edsp = flags & HAS_EDSP;
+    int has_media = flags & HAS_MEDIA;
+    int has_neon = flags & HAS_NEON;
+
+#if HAVE_ARMV6
+    if (has_media)
+    {
+        pbi->dequant.block               = vp8_dequantize_b_v6;
+        pbi->dequant.idct_add            = vp8_dequant_idct_add_v6;
+        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_v6;
+        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
+        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
+        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
+#if 0 /*For use with RTCD, when implemented*/
+        pbi->dboolhuff.start             = vp8dx_start_decode_c;
+        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
+        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
+        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
+#endif
+    }
+#endif
+
+#if HAVE_ARMV7
+    if (has_neon)
+    {
+        pbi->dequant.block               = vp8_dequantize_b_neon;
+        pbi->dequant.idct_add            = vp8_dequant_idct_add_neon;
+        /*This is not used: NEON always dequants two blocks at once.
+        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_neon;*/
+        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
+        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
+        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
+#if 0 /*For use with RTCD, when implemented*/
+        pbi->dboolhuff.start             = vp8dx_start_decode_c;
+        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
+        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
+        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
+#endif
+    }
+#endif
+#endif
+}
diff --git a/vp8/decoder/arm/armv5/dequantize_v5.asm b/vp8/decoder/arm/armv5/dequantize_v5.asm
index eb3f0307c..de3648ae2 100644
--- a/vp8/decoder/arm/armv5/dequantize_v5.asm
+++ b/vp8/decoder/arm/armv5/dequantize_v5.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
diff --git a/vp8/decoder/arm/armv6/dboolhuff_v6.asm b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
index 143e33e46..6515804bb 100644
--- a/vp8/decoder/arm/armv6/dboolhuff_v6.asm
+++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
new file mode 100644
index 000000000..6bebda24f
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -0,0 +1,218 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_dequant_dc_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+
+;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride, int Dc)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch  ; +4 = 40
+; sp + 40 = stride  ; +4 = 44
+; sp + 44 = Dc  ; +4 = 48
+
+
+|vp8_dequant_dc_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r6, [sp, #44]
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    mov     r12, #3
+
+vp8_dequant_dc_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_dc_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_dc_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_dc_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_dc_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp, #40]
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2], r12
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2], r12
+    ldr     lr, [sp]
+    ldr     r12, [sp, #44]
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [lr], r12
+    str     r1, [lr], r12
+    str     lr, [sp]
+    bne     vp8_dequant_dc_idct_loop2_v6
+
+; vpx_memset
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_dc_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
new file mode 100644
index 000000000..47b671ca6
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
@@ -0,0 +1,196 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT |vp8_dequant_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch  ; +4 = 40
+; sp + 40 = stride  ; +4 = 44
+
+
+|vp8_dequant_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    mov     r12, #4
+
+vp8_dequant_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp, #40]
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2], r12
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2], r12
+    ldr     lr, [sp]
+    ldr     r12, [sp, #44]
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [lr], r12
+    str     r1, [lr], r12
+    str     lr, [sp]
+    bne     vp8_dequant_idct_loop2_v6
+
+; vpx_memset
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
deleted file mode 100644
index 3daa9b34f..000000000
--- a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
+++ /dev/null
@@ -1,202 +0,0 @@
-;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_dc_idct_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
-|vp8_dequant_dc_idct_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r6, [sp, #36]           ;load Dc
-
-    ldr     r4, [r0]                ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r0, [sp]
-
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    mov     r12, #3
-
-dequant_dc_idct_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     dequant_dc_idct_loop
-
-    sub     r0, r0, #32
-    mov     r1, r2
-    mov     r2, r3
-
-; short_idct4x4llm_v6_dual
-
-    mov r3, #0x00004E00 ;                   cos
-    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
-    mov r4, #0x00008A00 ;                       sin
-    orr r4, r4, #0x0000008C ; sinpi8sqrt2
-    mov r5, #0x2    ; i=2                           i
-loop1_dual_11
-    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
-    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
-    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
-
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
-    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
-    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
-    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
-    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
-    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
-    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
-    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
-    subs    r5, r5, #0x1    ; i--                           --
-    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
-    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
-    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
-    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
-    usub16  r7, r8, r7  ; c                                 c
-    uadd16  r6, r6, r10 ; d                             d
-    uadd16  r10, r11, r14   ; a                                             a
-    usub16  r8, r11, r14    ; b                                     b
-    uadd16  r9, r10, r6 ; a+d                                           a+d
-    usub16  r10, r10, r6    ; a-d                                               a-d
-    uadd16  r6, r8, r7  ; b+c                               b+c
-    usub16  r7, r8, r7  ; b-c                                   b-c
-    str r6, [r1, r2]    ; o5 | o4
-    add r6, r2, r2  ; pitch * 2                             p2
-    str r7, [r1, r6]    ; o9 | o8
-    add r6,  r6, r2 ; pitch * 3                             p3
-    str r10, [r1, r6]   ; o13 | o12
-    str r9, [r1], #0x4  ; o1 | o0           ++
-    bne loop1_dual_11   ;
-    mov r5, #0x2    ; i=2                           i
-    sub r0, r1, #8  ; reset input/output        i/o
-loop2_dual_22
-    ldr r6, [r0, r2]    ; i5 | i4                               5|4
-    ldr r1, [r0]    ; i1 | i0           1|0
-    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
-    add r14, r2, #0x4   ; pitch + 2                                                             p+2
-    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
-    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
-    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
-    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 �                                     tc1
-    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
-    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
-    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
-    uadd16  r10, r11, r9    ; a                                             a
-    usub16  r9, r11, r9 ; b                                         b
-    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
-    subs    r5, r5, #0x1    ; i--                           --
-    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
-    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
-    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
-    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
-
-    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
-    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
-    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
-    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
-    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
-    uadd16  r7, r10, r6 ; a+d                                   a+d
-    mov r8, #0x4    ; set up 4's                                        4
-    orr r8, r8, #0x40000    ;                                       4|4
-    usub16  r6, r10, r6 ; a-d                               a-d
-    uadd16  r6, r6, r8  ; a-d+4                             3|7
-    uadd16  r7, r7, r8  ; a+d+4                                 0|4
-    uadd16  r10, r9, r12    ; b+c                                               b+c
-    usub16  r1, r9, r12 ; b-c           b-c
-    uadd16  r10, r10, r8    ; b+c+4                                             1|5
-    uadd16  r1, r1, r8  ; b-c+4         2|6
-    mov r8, r10, asr #19    ; o1 >> 3
-    strh    r8, [r0, #2]    ; o1
-    mov r8, r1, asr #19 ; o2 >> 3
-    strh    r8, [r0, #4]    ; o2
-    mov r8, r6, asr #19 ; o3 >> 3
-    strh    r8, [r0, #6]    ; o3
-    mov r8, r7, asr #19 ; o0 >> 3
-    strh    r8, [r0], r2    ; o0        +p
-    sxth    r10, r10    ;
-    mov r8, r10, asr #3 ; o5 >> 3
-    strh    r8, [r0, #2]    ; o5
-    sxth    r1, r1  ;
-    mov r8, r1, asr #3  ; o6 >> 3
-    strh    r8, [r0, #4]    ; o6
-    sxth    r6, r6  ;
-    mov r8, r6, asr #3  ; o7 >> 3
-    strh    r8, [r0, #6]    ; o7
-    sxth    r7, r7  ;
-    mov r8, r7, asr #3  ; o4 >> 3
-    strh    r8, [r0], r2    ; o4        +p
-;;;;;   subs    r5, r5, #0x1    ; i--                           --
-    bne loop2_dual_22   ;
-
-
-;vpx_memset
-    ldr     r0, [sp]
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
-
-    ENDP    ;|vp8_dequant_dc_idct_v68|
-
-    END
diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm
deleted file mode 100644
index 61bb48d04..000000000
--- a/vp8/decoder/arm/armv6/dequantidct_v6.asm
+++ /dev/null
@@ -1,183 +0,0 @@
-;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_idct_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
-|vp8_dequant_idct_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r4, [r0]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r0, [sp]
-
-    mov     r12, #4
-
-dequant_idct_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4        ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     dequant_idct_loop
-
-    sub     r0, r0, #32
-    mov     r1, r2
-    mov     r2, r3
-
-; short_idct4x4llm_v6_dual
-
-    mov r3, #0x00004E00 ;                   cos
-    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
-    mov r4, #0x00008A00 ;                       sin
-    orr r4, r4, #0x0000008C ; sinpi8sqrt2
-    mov r5, #0x2    ; i=2                           i
-loop1_dual_1
-    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
-    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
-    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
-
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
-    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
-    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
-    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
-    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
-    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
-    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
-    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
-    subs    r5, r5, #0x1    ; i--                           --
-    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
-    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
-    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
-    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
-    usub16  r7, r8, r7  ; c                                 c
-    uadd16  r6, r6, r10 ; d                             d
-    uadd16  r10, r11, r14   ; a                                             a
-    usub16  r8, r11, r14    ; b                                     b
-    uadd16  r9, r10, r6 ; a+d                                           a+d
-    usub16  r10, r10, r6    ; a-d                                               a-d
-    uadd16  r6, r8, r7  ; b+c                               b+c
-    usub16  r7, r8, r7  ; b-c                                   b-c
-    str r6, [r1, r2]    ; o5 | o4
-    add r6, r2, r2  ; pitch * 2                             p2
-    str r7, [r1, r6]    ; o9 | o8
-    add r6,  r6, r2 ; pitch * 3                             p3
-    str r10, [r1, r6]   ; o13 | o12
-    str r9, [r1], #0x4  ; o1 | o0           ++
-    bne loop1_dual_1    ;
-    mov r5, #0x2    ; i=2                           i
-    sub r0, r1, #8  ; reset input/output        i/o
-loop2_dual_2
-    ldr r6, [r0, r2]    ; i5 | i4                               5|4
-    ldr r1, [r0]    ; i1 | i0           1|0
-    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
-    add r14, r2, #0x4   ; pitch + 2                                                             p+2
-    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
-    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
-    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
-    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 �                                     tc1
-    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
-    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
-    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
-    uadd16  r10, r11, r9    ; a                                             a
-    usub16  r9, r11, r9 ; b                                         b
-    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
-    subs    r5, r5, #0x1    ; i--                           --
-    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
-    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
-    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
-    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
-
-    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
-    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
-    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
-    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
-    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
-    uadd16  r7, r10, r6 ; a+d                                   a+d
-    mov r8, #0x4    ; set up 4's                                        4
-    orr r8, r8, #0x40000    ;                                       4|4
-    usub16  r6, r10, r6 ; a-d                               a-d
-    uadd16  r6, r6, r8  ; a-d+4                             3|7
-    uadd16  r7, r7, r8  ; a+d+4                                 0|4
-    uadd16  r10, r9, r12    ; b+c                                               b+c
-    usub16  r1, r9, r12 ; b-c           b-c
-    uadd16  r10, r10, r8    ; b+c+4                                             1|5
-    uadd16  r1, r1, r8  ; b-c+4         2|6
-    mov r8, r10, asr #19    ; o1 >> 3
-    strh    r8, [r0, #2]    ; o1
-    mov r8, r1, asr #19 ; o2 >> 3
-    strh    r8, [r0, #4]    ; o2
-    mov r8, r6, asr #19 ; o3 >> 3
-    strh    r8, [r0, #6]    ; o3
-    mov r8, r7, asr #19 ; o0 >> 3
-    strh    r8, [r0], r2    ; o0        +p
-    sxth    r10, r10    ;
-    mov r8, r10, asr #3 ; o5 >> 3
-    strh    r8, [r0, #2]    ; o5
-    sxth    r1, r1  ;
-    mov r8, r1, asr #3  ; o6 >> 3
-    strh    r8, [r0, #4]    ; o6
-    sxth    r6, r6  ;
-    mov r8, r6, asr #3  ; o7 >> 3
-    strh    r8, [r0, #6]    ; o7
-    sxth    r7, r7  ;
-    mov r8, r7, asr #3  ; o4 >> 3
-    strh    r8, [r0], r2    ; o4        +p
-;;;;;   subs    r5, r5, #0x1    ; i--                           --
-    bne loop2_dual_2    ;
-            ;
-
-;vpx_memset
-    ldr     r0, [sp]
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
-
-    ENDP    ;|vp8_dequant_idct_v6|
-
-    END
diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/decoder/arm/armv6/dequantize_v6.asm
index 95e38594f..72f7e0ee5 100644
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ b/vp8/decoder/arm/armv6/dequantize_v6.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c
new file mode 100644
index 000000000..3c7bc502f
--- /dev/null
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "dequantize.h"
+
+void vp8_dequant_dc_idct_add_y_block_v6
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, short *dc)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]);
+        else
+            vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride);
+
+        if (eobs[1] > 1)
+            vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
+        else
+            vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride);
+
+        if (eobs[2] > 1)
+            vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
+        else
+            vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride);
+
+        if (eobs[3] > 1)
+            vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
+        else
+            vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride);
+
+        q    += 64;
+        dc   += 4;
+        pre  += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_y_block_v6
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        if (eobs[2] > 1)
+            vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, stride);
+            ((int *)(q+32))[0] = 0;
+        }
+
+        if (eobs[3] > 1)
+            vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride);
+            ((int *)(q+48))[0] = 0;
+        }
+
+        q    += 64;
+        pre  += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_v6
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        q    += 32;
+        pre  += 32;
+        dstu += 4*stride;
+        eobs += 2;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        q    += 32;
+        pre  += 32;
+        dstv += 4*stride;
+        eobs += 2;
+    }
+}
diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h
index 495004f9c..985951c7c 100644
--- a/vp8/decoder/arm/dboolhuff_arm.h
+++ b/vp8/decoder/arm/dboolhuff_arm.h
@@ -11,14 +11,11 @@
  * to be useless. However, its been left (for now)
  * for reference.
  */
-/*
+#if 0
 #if HAVE_ARMV6
 #undef vp8_dbool_start
 #define vp8_dbool_start vp8dx_start_decode_v6
 
-#undef vp8_dbool_stop
-#define vp8_dbool_stop vp8dx_stop_decode_v6
-
 #undef vp8_dbool_fill
 #define vp8_dbool_fill vp8_bool_decoder_fill_v6
 
@@ -27,15 +24,12 @@
 
 #undef vp8_dbool_devalue
 #define vp8_dbool_devalue vp8_decode_value_v6
-#endif // HAVE_ARMV6
+#endif /* HAVE_ARMV6 */
 
 #if HAVE_ARMV7
 #undef vp8_dbool_start
 #define vp8_dbool_start vp8dx_start_decode_neon
 
-#undef vp8_dbool_stop
-#define vp8_dbool_stop vp8dx_stop_decode_neon
-
 #undef vp8_dbool_fill
 #define vp8_dbool_fill vp8_bool_decoder_fill_neon
 
@@ -44,6 +38,6 @@
 
 #undef vp8_dbool_devalue
 #define vp8_dbool_devalue vp8_decode_value_neon
-#endif // HAVE_ARMV7
-*/
-#endif // DBOOLHUFF_ARM_H
+#endif /* HAVE_ARMV7 */
+#endif
+#endif /* DBOOLHUFF_ARM_H */
diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c
index 54006a921..b3e14b793 100644
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -29,7 +30,7 @@ void vp8_dequantize_b_neon(BLOCKD *d)
     int i;
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
-    short *DQC = &d->dequant[0][0];
+    short *DQC = d->dequant;
 
     vp8_dequantize_b_loop_neon(Q, DQC, DQ);
 }
@@ -41,7 +42,7 @@ void vp8_dequantize_b_v6(BLOCKD *d)
     int i;
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
-    short *DQC = &d->dequant[0][0];
+    short *DQC = d->dequant;
 
     vp8_dequantize_b_loop_v6(Q, DQC, DQ);
 }
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
index c8a61a4a7..b7d800d26 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -13,32 +14,60 @@
 
 #if HAVE_ARMV6
 extern prototype_dequant_block(vp8_dequantize_b_v6);
-extern prototype_dequant_idct(vp8_dequant_idct_v6);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
+extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_v6
 
-#undef  vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_v6
+#undef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
 
-#undef  vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6
+#undef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
+
+#undef vp8_dequant_dc_idct_add_y_block
+#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
+
+#undef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
+
+#undef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
+#endif
 #endif
 
 #if HAVE_ARMV7
 extern prototype_dequant_block(vp8_dequantize_b_neon);
-extern prototype_dequant_idct(vp8_dequant_idct_neon);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
+extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_neon
 
-#undef  vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_neon
+#undef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
 
-#undef  vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon
+#undef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
+
+#undef vp8_dequant_dc_idct_add_y_block
+#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
+
+#undef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
+
+#undef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+#endif
 #endif
 
 #endif
diff --git a/vp8/decoder/arm/detokenize.asm b/vp8/decoder/arm/detokenize.asm
new file mode 100644
index 000000000..45e068a9f
--- /dev/null
+++ b/vp8/decoder/arm/detokenize.asm
@@ -0,0 +1,320 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_decode_mb_tokens_v6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    INCLUDE vpx_asm_offsets.asm
+
+l_qcoeff    EQU     0
+l_i         EQU     4
+l_type      EQU     8
+l_stop      EQU     12
+l_c         EQU     16
+l_l_ptr     EQU     20
+l_a_ptr     EQU     24
+l_bc        EQU     28
+l_coef_ptr  EQU     32
+l_stacksize EQU     64
+
+
+;; constant offsets -- these should be created at build time
+c_block2above_offset         EQU 25
+c_entropy_nodes              EQU 11
+c_dct_eob_token              EQU 11
+
+|vp8_decode_mb_tokens_v6| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    sub         sp, sp, #l_stacksize
+    mov         r7, r1                      ; type
+    mov         r9, r0                      ; detoken
+
+    ldr         r1, [r9, #detok_current_bc]
+    ldr         r0, [r9, #detok_qcoeff_start_ptr]
+    mov         r11, #0                     ; i
+    mov         r3, #16                     ; stop
+
+    cmp         r7, #1                      ; type ?= 1
+    addeq       r11, r11, #24               ; i = 24
+    addeq       r3, r3, #8                  ; stop = 24
+    addeq       r0, r0, #3, 24              ; qcoefptr += 24*16
+
+    str         r0, [sp, #l_qcoeff]
+    str         r11, [sp, #l_i]
+    str         r7, [sp, #l_type]
+    str         r3, [sp, #l_stop]
+    str         r1, [sp, #l_bc]
+
+    add         lr, r9, r7, lsl #2          ; detoken + type*4
+
+    ldr         r8, [r1, #bool_decoder_user_buffer]
+
+    ldr         r10, [lr, #detok_coef_probs]
+    ldr         r5, [r1, #bool_decoder_count]
+    ldr         r6, [r1, #bool_decoder_range]
+    ldr         r4, [r1, #bool_decoder_value]
+
+    str         r10, [sp, #l_coef_ptr]
+
+BLOCK_LOOP
+    ldr         r3, [r9, #detok_ptr_block2leftabove]
+    ldr         r1, [r9, #detok_L]
+    ldr         r2, [r9, #detok_A]
+    ldrb        r12, [r3, r11]!             ; block2left[i]
+    ldrb        r3, [r3, #c_block2above_offset]; block2above[i]
+
+    cmp         r7, #0                      ; c = !type
+    moveq       r7, #1
+    movne       r7, #0
+
+    ldrb        r0, [r1, r12]!              ; *(L += block2left[i])
+    ldrb        r3, [r2, r3]!               ; *(A += block2above[i])
+    mov         lr, #c_entropy_nodes        ; ENTROPY_NODES = 11
+
+; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0)
+    cmp         r0, #0                      ; *l ?= 0
+    movne       r0, #1
+    cmp         r3, #0                      ; *a ?= 0
+    addne       r0, r0, #1                  ; t
+
+    str         r1, [sp, #l_l_ptr]          ; save &l
+    str         r2, [sp, #l_a_ptr]          ; save &a
+    smlabb      r0, r0, lr, r10             ; Prob = coef_probs + (t * ENTROPY_NODES)
+    mov         r1, #0                      ; t = 0
+    str         r7, [sp, #l_c]
+
+    ;align 4
+COEFF_LOOP
+    ldr         r3, [r9, #detok_ptr_coef_bands_x]
+    ldr         lr, [r9, #detok_coef_tree_ptr]
+    ;STALL
+    ldrb        r3, [r3, r7]                ; coef_bands_x[c]
+    ;STALL
+    ;STALL
+    add         r0, r0, r3                  ; Prob += coef_bands_x[c]
+
+get_token_loop
+    ldrb        r2, [r0, +r1, asr #1]       ; Prob[t >> 1]
+    mov         r3, r6, lsl #8              ; range << 8
+    sub         r3, r3, #256                ; (range << 8) - (1 << 8)
+    mov         r10, #1                     ; 1
+
+    smlawb      r2, r3, r2, r10             ; split = 1 + (((range-1) * probability) >> 8)
+
+    ldrb        r12, [r8]                   ; load cx data byte in stall slot : r8 = bufptr
+    ;++
+
+    subs        r3, r4, r2, lsl #24         ; value-(split<<24): used later to calculate shift for NORMALIZE
+    addhs       r1, r1, #1                  ; t += 1
+    movhs       r4, r3                      ; value -= bigsplit (split << 24)
+    subhs       r2, r6, r2                  ; range -= split
+ ;   movlo       r6, r2                      ; range = split
+
+    ldrsb     r1, [lr, r1]                  ; t = onyx_coef_tree_ptr[t]
+
+; NORMALIZE
+    clz         r3, r2                      ; vp8dx_bitreader_norm[range] + 24
+    sub         r3, r3, #24                 ; vp8dx_bitreader_norm[range]
+    subs        r5, r5, r3                  ; count -= shift
+    mov         r6, r2, lsl r3              ; range <<= shift
+    mov         r4, r4, lsl r3              ; value <<= shift
+
+; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
+    addle         r5, r5, #8                ; count += 8
+    rsble         r3, r5, #24               ; 24 - count
+    addle         r8, r8, #1                ; bufptr++
+    orrle         r4, r4, r12, lsl r3       ; value |= *bufptr << shift + 16
+
+    cmp         r1, #0                      ; t ?= 0
+    bgt         get_token_loop              ; while (t > 0)
+
+    cmn         r1, #c_dct_eob_token        ; if(t == -DCT_EOB_TOKEN)
+    beq         END_OF_BLOCK                ; break
+
+    rsb         lr, r1, #0                  ; v = -t;
+
+    cmp         lr, #4                      ; if(v > FOUR_TOKEN)
+    ble         SKIP_EXTRABITS
+
+    ldr         r3, [r9, #detok_teb_base_ptr]
+    mov         r11, #1                     ; 1 in split = 1 + ... nope, v+= 1 << bits_count
+    add         r7, r3, lr, lsl #4          ; detok_teb_base_ptr + (v << 4)
+
+    ldrsh       lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
+    ldrsh       r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
+
+extrabits_loop
+    add         r3, r0, r7                  ; &teb_ptr->Probs[bits_count]
+
+    ldrb        r2, [r3, #4]                ; probability. why +4?
+    mov         r3, r6, lsl #8              ; range << 8
+    sub         r3, r3, #256                ; range << 8 + 1 << 8
+
+    smlawb      r2, r3, r2, r11             ; split = 1 +  (((range-1) * probability) >> 8)
+
+    ldrb        r12, [r8]                   ; *bufptr
+    ;++
+
+    subs        r10, r4, r2, lsl #24        ; value - (split<<24)
+    movhs       r4, r10                     ; value = value - (split << 24)
+    subhs       r2, r6, r2                  ; range = range - split
+    addhs       lr, lr, r11, lsl r0         ; v += ((UINT16)1<<bits_count)
+
+; NORMALIZE
+    clz         r3, r2                      ; shift - leading zeros in split
+    sub         r3, r3, #24                 ; don't count first 3 bytes
+    subs        r5, r5, r3                  ; count -= shift
+    mov         r6, r2, lsl r3              ; range = range << shift
+    mov         r4, r4, lsl r3              ; value <<= shift
+
+    addle       r5, r5, #8                  ; count += BR_COUNT
+    addle       r8, r8, #1                  ; bufptr++
+    rsble       r3, r5, #24                 ; BR_COUNT - count
+    orrle       r4, r4, r12, lsl r3         ; value |= *bufptr << (BR_COUNT - count)
+
+    subs        r0, r0, #1                  ; bits_count --
+    bpl         extrabits_loop
+
+
+SKIP_EXTRABITS
+    ldr         r11, [sp, #l_qcoeff]
+    ldr         r0, [sp, #l_coef_ptr]       ; Prob = coef_probs
+
+    cmp         r1, #0                      ; check for nonzero token - if (t)
+    beq         SKIP_EOB_CHECK              ; if t is zero, we will skip the eob table chec
+
+    add         r3, r6, #1                  ; range + 1
+    mov         r2, r3, lsr #1              ; split = (range + 1) >> 1
+
+    subs        r3, r4, r2, lsl #24         ; value - (split<<24)
+    movhs       r4, r3                      ; value -= (split << 24)
+    subhs       r2, r6, r2                  ; range -= split
+    mvnhs       r3, lr                      ; -v
+    addhs       lr, r3, #1                  ; v = (v ^ -1) + 1
+
+; NORMALIZE
+    clz         r3, r2                      ; leading 0s in split
+    sub         r3, r3, #24                 ; shift
+    subs        r5, r5, r3                  ; count -= shift
+    mov         r6, r2, lsl r3              ; range <<= shift
+    mov         r4, r4, lsl r3              ; value <<= shift
+    ldrleb      r2, [r8], #1                ; *(bufptr++)
+    addle       r5, r5, #8                  ; count += 8
+    rsble       r3, r5, #24                 ; BR_COUNT - count
+    orrle       r4, r4, r2, lsl r3          ; value |= *bufptr << (BR_COUNT - count)
+
+    add         r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
+
+    cmn         r1, #1                      ; t < -ONE_TOKEN
+
+    addlt       r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
+
+    mvn         r1, #1                      ; t = -1 ???? C is -2
+
+SKIP_EOB_CHECK
+    ldr         r7, [sp, #l_c]              ; c
+    ldr         r3, [r9, #detok_scan]
+    add         r1, r1, #2                  ; t+= 2
+    cmp         r7, #15                     ; c should will be one higher
+
+    ldr         r3, [r3, +r7, lsl #2]       ; scan[c] this needs pre-inc c value
+    add         r7, r7, #1                  ; c++
+    add         r3, r11, r3, lsl #1         ; qcoeff + scan[c]
+
+    str         r7, [sp, #l_c]              ; store c
+    strh        lr, [r3]                    ; qcoef_ptr[scan[c]] = v
+
+    blt         COEFF_LOOP
+
+    sub         r7, r7, #1                  ; if(t != -DCT_EOB_TOKEN) --c
+
+END_OF_BLOCK
+    ldr         r3, [sp, #l_type]           ; type
+    ldr         r10, [sp, #l_coef_ptr]      ; coef_ptr
+    ldr         r0, [sp, #l_qcoeff]         ; qcoeff
+    ldr         r11, [sp, #l_i]             ; i
+    ldr         r12, [sp, #l_stop]          ; stop
+
+    cmp         r3, #0                      ; type ?= 0
+    moveq       r1, #1
+    movne       r1, #0
+    add         r3, r11, r9                 ; detok + i
+
+    cmp         r7, r1                      ; c ?= !type
+    strb        r7, [r3, #detok_eob]        ; eob[i] = c
+
+    ldr         r7, [sp, #l_l_ptr]          ; l
+    ldr         r2, [sp, #l_a_ptr]          ; a
+    movne       r3, #1                      ; t
+    moveq       r3, #0
+
+    add         r0, r0, #32                 ; qcoeff += 32 (16 * 2?)
+    add         r11, r11, #1                ; i++
+    strb        r3, [r7]                    ; *l = t
+    strb        r3, [r2]                    ; *a = t
+    str         r0, [sp, #l_qcoeff]         ; qcoeff
+    str         r11, [sp, #l_i]             ; i
+
+    cmp         r11, r12                    ; i < stop
+    ldr         r7, [sp, #l_type]           ; type
+
+    blt         BLOCK_LOOP
+
+    cmp         r11, #25                    ; i ?= 25
+    bne         ln2_decode_mb_to
+
+    ldr         r12, [r9, #detok_qcoeff_start_ptr]
+    ldr         r10, [r9, #detok_coef_probs]
+    mov         r7, #0                      ; type/i = 0
+    mov         r3, #16                     ; stop = 16
+    str         r12, [sp, #l_qcoeff]        ; qcoeff_ptr = qcoeff_start_ptr
+    str         r7, [sp, #l_i]
+    str         r7, [sp, #l_type]
+    str         r3, [sp, #l_stop]
+
+    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type=0]
+
+    b           BLOCK_LOOP
+
+ln2_decode_mb_to
+    cmp         r11, #16                    ; i ?= 16
+    bne         ln1_decode_mb_to
+
+    mov         r10, #detok_coef_probs
+    add         r10, r10, #2*4              ; coef_probs[type]
+    ldr         r10, [r9, r10]              ; detok + detok_coef_probs[type]
+
+    mov         r7, #2                      ; type = 2
+    mov         r3, #24                     ; stop = 24
+
+    str         r7, [sp, #l_type]
+    str         r3, [sp, #l_stop]
+
+    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type]
+    b           BLOCK_LOOP
+
+ln1_decode_mb_to
+    ldr         r2, [sp, #l_bc]
+    mov         r0, #0
+    nop
+
+    str         r8, [r2, #bool_decoder_user_buffer]
+    str         r5, [r2, #bool_decoder_count]
+    str         r4, [r2, #bool_decoder_value]
+    str         r6, [r2, #bool_decoder_range]
+
+    add         sp, sp, #l_stacksize
+    ldmia       sp!, {r4 - r11, pc}
+
+    ENDP  ; |vp8_decode_mb_tokens_v6|
+
+    END
diff --git a/vp8/decoder/arm/detokenize_arm.h b/vp8/decoder/arm/detokenize_arm.h
new file mode 100644
index 000000000..9bb19b6cf
--- /dev/null
+++ b/vp8/decoder/arm/detokenize_arm.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_ARM_H
+#define DETOKENIZE_ARM_H
+
+#if HAVE_ARMV6
+#if CONFIG_ARM_ASM_DETOK
+void vp8_init_detokenizer(VP8D_COMP *dx);
+void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
+#endif
+#endif
+
+#endif
diff --git a/vp8/decoder/arm/detokenizearm_sjl.c b/vp8/decoder/arm/detokenizearm_sjl.c
deleted file mode 100644
index c714452a6..000000000
--- a/vp8/decoder/arm/detokenizearm_sjl.c
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-#include "type_aliases.h"
-#include "blockd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
-
-#define BR_COUNT 8
-#define BOOL_DATA UINT8
-
-#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-//ALIGN16 UINT16 onyx_coef_bands_x[16] = { 0, 1*OCB_X, 2*OCB_X, 3*OCB_X, 6*OCB_X, 4*OCB_X, 5*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 7*OCB_X};
-DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
-
-#define EOB_CONTEXT_NODE            0
-#define ZERO_CONTEXT_NODE           1
-#define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        3
-#define TWO_CONTEXT_NODE            4
-#define THREE_CONTEXT_NODE          5
-#define HIGH_LOW_CONTEXT_NODE       6
-#define CAT_ONE_CONTEXT_NODE        7
-#define CAT_THREEFOUR_CONTEXT_NODE  8
-#define CAT_THREE_CONTEXT_NODE      9
-#define CAT_FIVE_CONTEXT_NODE       10
-
-
-
-
-DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) =
-{
-    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //ZERO_TOKEN
-    {  1, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //ONE_TOKEN
-    {  2, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //TWO_TOKEN
-    {  3, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //THREE_TOKEN
-    {  4, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //FOUR_TOKEN
-    {  5, 0, { 159, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //DCT_VAL_CATEGORY1
-    {  7, 1, { 145, 165, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY2
-    { 11, 2, { 140, 148, 173, 0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY3
-    { 19, 3, { 135, 140, 155, 176, 0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY4
-    { 35, 4, { 130, 134, 141, 157, 180, 0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY5
-    { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0   } }, //DCT_VAL_CATEGORY6
-    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  // EOB TOKEN
-};
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) =
-{
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context
-    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above
-};
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
-{
-    ENTROPY_CONTEXT **const A = x->above_context;
-    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
-
-    ENTROPY_CONTEXT *a;
-    ENTROPY_CONTEXT *l;
-    int i;
-
-    for (i = 0; i < 24; i++)
-    {
-
-        a = A[ vp8_block2context[i] ] + vp8_block2above[i];
-        l = L[ vp8_block2context[i] ] + vp8_block2left[i];
-
-        *a = *l = 0;
-    }
-
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
-    {
-        a = A[Y2CONTEXT] + vp8_block2above[24];
-        l = L[Y2CONTEXT] + vp8_block2left[24];
-        *a = *l = 0;
-    }
-
-
-}
-
-#define ONYXBLOCK2CONTEXT_OFFSET    0
-#define ONYXBLOCK2LEFT_OFFSET       25
-#define ONYXBLOCK2ABOVE_OFFSET 50
-
-DECLARE_ALIGNED(16, const static unsigned char, norm[128]) =
-{
-    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-void init_detokenizer(VP8D_COMP *dx)
-{
-    const VP8_COMMON *const oc = & dx->common;
-    MACROBLOCKD *x = & dx->mb;
-
-    dx->detoken.norm_ptr = (unsigned char *)norm;
-    dx->detoken.vp8_coef_tree_ptr = (vp8_tree_index *)vp8_coef_tree;
-    dx->detoken.ptr_onyxblock2context_leftabove = (UINT8 *)vp8_block2context_leftabove;
-    dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x;
-    dx->detoken.scan = (int *)vp8_default_zig_zag1d;
-    dx->detoken.teb_base_ptr = (TOKENEXTRABITS *)vp8d_token_extra_bits2;
-
-    dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
-
-
-    dx->detoken.coef_probs[0] = (unsigned char *)(oc->fc.coef_probs [0] [ 0 ] [0]);
-    dx->detoken.coef_probs[1] = (unsigned char *)(oc->fc.coef_probs [1] [ 0 ] [0]);
-    dx->detoken.coef_probs[2] = (unsigned char *)(oc->fc.coef_probs [2] [ 0 ] [0]);
-    dx->detoken.coef_probs[3] = (unsigned char *)(oc->fc.coef_probs [3] [ 0 ] [0]);
-
-}
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-
-//shift = norm[range]; \
-//      shift = norm_ptr[range]; \
-
-#define NORMALIZE \
-    /*if(range < 0x80)*/                            \
-    { \
-        shift = detoken->norm_ptr[range]; \
-        range <<= shift; \
-        value <<= shift; \
-        count -= shift; \
-        if(count <= 0) \
-        { \
-            count += BR_COUNT ; \
-            value |= (*bufptr) << (BR_COUNT-count); \
-            bufptr++; \
-        } \
-    }
-#if 1
-#define DECODE_AND_APPLYSIGN(value_to_sign) \
-    split = (range + 1) >> 1; \
-    if ( (value >> 24) < split ) \
-    { \
-        range = split; \
-        v= value_to_sign; \
-    } \
-    else \
-    { \
-        range = range-split; \
-        value = value-(split<<24); \
-        v = -value_to_sign; \
-    } \
-    range +=range;                   \
-    value +=value;                   \
-    if (!--count) \
-    { \
-        count = BR_COUNT; \
-        value |= *bufptr; \
-        bufptr++; \
-    }
-
-#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
-    { \
-        split = 1 +  ((( probability*(range-1) ) )>> 8); \
-        if ( (value >> 24) < split ) \
-        { \
-            range = split; \
-            NORMALIZE \
-            goto branch; \
-        } \
-        value -= (split<<24); \
-        range = range - split; \
-        NORMALIZE \
-    }
-
-#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
-    { \
-        split = 1 + ((( probability*(range-1) ) ) >> 8); \
-        if ( (value >> 24) < split ) \
-        { \
-            range = split; \
-            NORMALIZE \
-            Prob = coef_probs; \
-            ++c; \
-            Prob += vp8_coef_bands_x[c]; \
-            goto branch; \
-        } \
-        value -= (split<<24); \
-        range = range - split; \
-        NORMALIZE \
-    }
-
-#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
-    DECODE_AND_APPLYSIGN(val) \
-    Prob = coef_probs + (ENTROPY_NODES*2); \
-    if(c < 15){\
-        qcoeff_ptr [ scan[c] ] = (INT16) v; \
-        ++c; \
-        goto DO_WHILE; }\
-    qcoeff_ptr [ scan[15] ] = (INT16) v; \
-    goto BLOCK_FINISHED;
-
-
-#define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\
-    split = 1 +  (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \
-    if(value >= (split<<24))\
-    {\
-        range = range-split;\
-        value = value-(split<<24);\
-        val += ((UINT16)1<<bits_count);\
-    }\
-    else\
-    {\
-        range = split;\
-    }\
-    NORMALIZE
-#endif
-
-#if 0
-int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
-{
-    ENTROPY_CONTEXT **const A = x->above_context;
-    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
-    const VP8_COMMON *const oc = & dx->common;
-
-    BOOL_DECODER *bc = x->current_bc;
-
-    ENTROPY_CONTEXT *a;
-    ENTROPY_CONTEXT *l;
-    int i;
-
-    int eobtotal = 0;
-
-    register int count;
-
-    BOOL_DATA *bufptr;
-    register unsigned int range;
-    register unsigned int value;
-    const int *scan;
-    register unsigned int shift;
-    UINT32 split;
-    INT16 *qcoeff_ptr;
-
-    UINT8 *coef_probs;
-    int type;
-    int stop;
-    INT16 val, bits_count;
-    INT16 c;
-    INT16 t;
-    INT16 v;
-    vp8_prob *Prob;
-
-    //int *scan;
-    type = 3;
-    i = 0;
-    stop = 16;
-
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
-    {
-        i = 24;
-        stop = 24;
-        type = 1;
-        qcoeff_ptr = &x->qcoeff[24*16];
-        scan = vp8_default_zig_zag1d;
-        eobtotal -= 16;
-    }
-    else
-    {
-        scan = vp8_default_zig_zag1d;
-        qcoeff_ptr = &x->qcoeff[0];
-    }
-
-    count   = bc->count;
-    range   = bc->range;
-    value   = bc->value;
-    bufptr  = &bc->buffer[bc->pos];
-
-
-    coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
-
-BLOCK_LOOP:
-    a = A[ vp8_block2context[i] ] + vp8_block2above[i];
-    l = L[ vp8_block2context[i] ] + vp8_block2left[i];
-    c = (INT16)(!type);
-
-    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
-    Prob = coef_probs;
-    Prob += t * ENTROPY_NODES;
-
-DO_WHILE:
-    Prob += vp8_coef_bands_x[c];
-    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
-
-CHECK_0_:
-    DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_);
-    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val;
-    bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length;
-
-    do
-    {
-        DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
-        bits_count -- ;
-    }
-    while (bits_count >= 0);
-
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_FIVE_CONTEXT_NODE_0_:
-    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_THREEFOUR_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_);
-    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_THREE_CONTEXT_NODE_0_:
-    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-HIGH_LOW_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_);
-
-    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_ONE_CONTEXT_NODE_0_:
-    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-LOW_VAL_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
-
-THREE_CONTEXT_NODE_0_:
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
-
-TWO_CONTEXT_NODE_0_:
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
-
-ONE_CONTEXT_NODE_0_:
-    DECODE_AND_APPLYSIGN(1);
-    Prob = coef_probs + ENTROPY_NODES;
-
-    if (c < 15)
-    {
-        qcoeff_ptr [ scan[c] ] = (INT16) v;
-        ++c;
-        goto DO_WHILE;
-    }
-
-    qcoeff_ptr [ scan[15] ] = (INT16) v;
-BLOCK_FINISHED:
-    t = ((x->Block[i].eob = c) != !type);   // any nonzero data?
-    eobtotal += x->Block[i].eob;
-    *a = *l = t;
-    qcoeff_ptr += 16;
-
-    i++;
-
-    if (i < stop)
-        goto BLOCK_LOOP;
-
-    if (i == 25)
-    {
-        scan = vp8_default_zig_zag1d;//x->scan_order1d;
-        type = 0;
-        i = 0;
-        stop = 16;
-        coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
-        qcoeff_ptr = &x->qcoeff[0];
-        goto BLOCK_LOOP;
-    }
-
-    if (i == 16)
-    {
-        type = 2;
-        coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
-        stop = 24;
-        goto BLOCK_LOOP;
-    }
-
-    bc->count = count;
-    bc->value = value;
-    bc->range = range;
-    bc->pos  = bufptr - bc->buffer;
-    return eobtotal;
-
-}
-//#endif
-#else
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-#if 0
-//uses relative offsets
-
-const vp8_tree_index vp8_coef_tree_x[ 22] =   /* corresponding _CONTEXT_NODEs */
-{
-    -DCT_EOB_TOKEN, 1,                             /* 0 = EOB */
-    -ZERO_TOKEN, 1,                               /* 1 = ZERO */
-    -ONE_TOKEN, 1,                               /* 2 = ONE */
-    2, 5,                                       /* 3 = LOW_VAL */
-    -TWO_TOKEN, 1,                         /* 4 = TWO */
-    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-    2, 3,                                  /* 6 = HIGH_LOW */
-    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
-    2, 3,                                 /* 8 = CAT_THREEFOUR */
-    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
-    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
-};
-#endif
-
-#define _SCALEDOWN 8 //16 //8
-
-int vp8_decode_mb_tokens_v5(DETOK *detoken, int type);
-
-int vp8_decode_mb_tokens_v5_c(DETOK *detoken, int type)
-{
-    BOOL_DECODER *bc = detoken->current_bc;
-
-    ENTROPY_CONTEXT *a;
-    ENTROPY_CONTEXT *l;
-    int i;
-
-    register int count;
-
-    BOOL_DATA *bufptr;
-    register unsigned int range;
-    register unsigned int value;
-    register unsigned int shift;
-    UINT32 split;
-    INT16 *qcoeff_ptr;
-
-    UINT8 *coef_probs;
-//  int type;
-    int stop;
-    INT16 c;
-    INT16 t;
-    INT16 v;
-    vp8_prob *Prob;
-
-
-
-//  type = 3;
-    i = 0;
-    stop = 16;
-    qcoeff_ptr = detoken->qcoeff_start_ptr;
-
-//  if( detoken->mode != B_PRED && detoken->mode != SPLITMV)
-    if (type == 1)
-    {
-        i += 24;
-        stop += 8; //24;
-//      type = 1;
-        qcoeff_ptr += 24 * 16;
-//      eobtotal-=16;
-    }
-
-    count   = bc->count;
-    range   = bc->range;
-    value   = bc->value;
-    bufptr  = &bc->buffer[bc->pos];
-
-
-    coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
-
-BLOCK_LOOP:
-    a = detoken->A[ detoken->ptr_onyxblock2context_leftabove[i] ];
-    l = detoken->L[ detoken->ptr_onyxblock2context_leftabove[i] ];
-    c = !type;
-    a += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2ABOVE_OFFSET];
-    l += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2LEFT_OFFSET];
-
-    //#define ONYX_COMBINEENTROPYCONTEXTS( Dest, A, B) \
-    //Dest = ((A)!=0) + ((B)!=0);
-
-    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
-
-    Prob = coef_probs;
-    Prob += t * ENTROPY_NODES;
-    t = 0;
-
-    do
-    {
-
-        {
-//                  onyx_tree_index * onyx_coef_tree_ptr = onyx_coef_tree_x;
-
-            Prob += detoken->ptr_onyx_coef_bands_x[c];
-
-        GET_TOKEN_START:
-
-            do
-            {
-                split = 1 + (((range - 1) * (Prob[t>>1])) >> 8);
-
-                if (value >> 24 >= split)
-                {
-                    range = range - split;
-                    value = value - (split << 24);
-                    t += 1;
-
-                    //used to eliminate else branch
-                    split = range;
-                }
-
-                range = split;
-
-                t = detoken->vp8_coef_tree_ptr[ t ];
-
-                NORMALIZE
-
-            }
-            while (t  > 0) ;
-        }
-    GET_TOKEN_STOP:
-
-        if (t == -DCT_EOB_TOKEN)
-        {
-            break;
-        }
-
-        v = -t;
-
-        if (v > FOUR_TOKEN)
-        {
-            INT16 bits_count;
-            TOKENEXTRABITS *teb_ptr;
-
-//                      teb_ptr = &onyxd_token_extra_bits2[t];
-//                  teb_ptr = &onyxd_token_extra_bits2[v];
-            teb_ptr = &detoken->teb_base_ptr[v];
-
-
-            v = teb_ptr->min_val;
-            bits_count = teb_ptr->Length;
-
-            do
-            {
-                split = 1 + (((range - 1) * teb_ptr->Probs[bits_count]) >> _SCALEDOWN);
-
-                if ((value >> 24) >= split)
-                {
-                    range = range - split;
-                    value = value - (split << 24);
-                    v += ((UINT16)1 << bits_count);
-
-                    //used to eliminate else branch
-                    split = range;
-                }
-
-                range = split;
-
-                NORMALIZE
-
-                bits_count -- ;
-            }
-            while (bits_count >= 0);
-        }
-
-        Prob = coef_probs;
-
-        if (t)
-        {
-            split = 1 + (((range - 1) * vp8_prob_half) >> 8);
-
-            if ((value >> 24) >= split)
-            {
-                range = range - split;
-                value = value - (split << 24);
-                v = (v ^ -1) + 1;           /* negate w/out conditionals */
-
-                //used to eliminate else branch
-                split = range;
-            }
-
-            range = split;
-
-            NORMALIZE
-            Prob += ENTROPY_NODES;
-
-            if (t < -ONE_TOKEN)
-                Prob += ENTROPY_NODES;
-
-            t = -2;
-        }
-
-        //if t is zero, we will skip the eob table check
-        t += 2;
-        qcoeff_ptr [detoken->scan [c] ] = (INT16) v;
-
-    }
-    while (++c < 16);
-
-    if (t != -DCT_EOB_TOKEN)
-    {
-        --c;
-    }
-
-    t = ((detoken->eob[i] = c) != !type);   // any nonzero data?
-//  eobtotal += detoken->eob[i];
-    *a = *l = t;
-    qcoeff_ptr += 16;
-
-    i++;
-
-    if (i < stop)
-        goto BLOCK_LOOP;
-
-    if (i == 25)
-    {
-        type = 0;
-        i = 0;
-        stop = 16;
-//      coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
-        coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
-        qcoeff_ptr = detoken->qcoeff_start_ptr;
-        goto BLOCK_LOOP;
-    }
-
-    if (i == 16)
-    {
-        type = 2;
-//      coef_probs =(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
-        coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
-        stop = 24;
-        goto BLOCK_LOOP;
-    }
-
-    bc->count = count;
-    bc->value = value;
-    bc->range = range;
-    bc->pos  = bufptr - bc->buffer;
-    return 0;
-}
-//#if 0
-int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
-{
-//  const ONYX_COMMON * const oc = & dx->common;
-    int eobtotal = 0;
-    int i, type;
-    /*
-        dx->detoken.norm_ptr = norm;
-        dx->detoken.onyx_coef_tree_ptr = onyx_coef_tree;
-        dx->detoken.ptr_onyxblock2context_leftabove = ONYXBLOCK2CONTEXT_LEFTABOVE;
-        dx->detoken.ptr_onyx_coef_bands_x = onyx_coef_bands_x;
-        dx->detoken.scan = default_zig_zag1d;
-        dx->detoken.teb_base_ptr = onyxd_token_extra_bits2;
-
-        dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
-
-        dx->detoken.A = x->above_context;
-        dx->detoken.L = x->left_context;
-
-        dx->detoken.coef_probs[0] = (unsigned char *)( oc->fc.coef_probs [0] [ 0 ] [0]);
-        dx->detoken.coef_probs[1] = (unsigned char *)( oc->fc.coef_probs [1] [ 0 ] [0]);
-        dx->detoken.coef_probs[2] = (unsigned char *)( oc->fc.coef_probs [2] [ 0 ] [0]);
-        dx->detoken.coef_probs[3] = (unsigned char *)( oc->fc.coef_probs [3] [ 0 ] [0]);
-    */
-
-    dx->detoken.current_bc = x->current_bc;
-    dx->detoken.A = x->above_context;
-    dx->detoken.L = x->left_context;
-
-    type = 3;
-
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
-    {
-        type = 1;
-        eobtotal -= 16;
-    }
-
-    vp8_decode_mb_tokens_v5(&dx->detoken, type);
-
-    for (i = 0; i < 25; i++)
-    {
-        x->Block[i].eob = dx->detoken.eob[i];
-        eobtotal += dx->detoken.eob[i];
-    }
-
-    return eobtotal;
-}
-#endif
diff --git a/vp8/decoder/arm/detokenizearm_v6.asm b/vp8/decoder/arm/detokenizearm_v6.asm
deleted file mode 100644
index 4d87ee5bd..000000000
--- a/vp8/decoder/arm/detokenizearm_v6.asm
+++ /dev/null
@@ -1,364 +0,0 @@
-;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_mb_tokens_v5|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    INCLUDE vpx_asm_offsets.asm
-
-l_qcoeff    EQU     0
-l_i         EQU     4
-l_type      EQU     8
-l_stop      EQU     12
-l_c         EQU     16
-l_l_ptr      EQU     20
-l_a_ptr      EQU     24
-l_bc        EQU     28
-l_coef_ptr   EQU     32
-l_stacksize EQU     64
-
-
-;; constant offsets -- these should be created at build time
-c_onyxblock2left_offset      EQU 25
-c_onyxblock2above_offset     EQU 50
-c_entropy_nodes              EQU 11
-c_dct_eob_token              EQU 11
-
-|vp8_decode_mb_tokens_v5| PROC
-    stmdb       sp!, {r4 - r11, lr}
-    sub         sp, sp, #l_stacksize
-    mov         r7, r1
-    mov         r9, r0                      ;DETOK *detoken
-
-    ldr         r1, [r9, #detok_current_bc]
-    ldr         r0, [r9, #detok_qcoeff_start_ptr]
-    mov         r11, #0
-    mov         r3, #0x10
-
-    cmp         r7, #1
-    addeq       r11, r11, #24
-    addeq       r3, r3, #8
-    addeq       r0, r0, #3, 24
-
-    str         r0, [sp, #l_qcoeff]
-    str         r11, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-    str         r1, [sp, #l_bc]
-
-    add         lr, r9, r7, lsl #2
-
-    ldr         r2, [r1, #bool_decoder_buffer]
-    ldr         r3, [r1, #bool_decoder_pos]
-
-    ldr         r10, [lr, #detok_coef_probs]
-    ldr         r5, [r1, #bool_decoder_count]
-    ldr         r6, [r1, #bool_decoder_range]
-    ldr         r4, [r1, #bool_decoder_value]
-    add         r8, r2, r3
-
-    str         r10, [sp, #l_coef_ptr]
-
-
-    ;align 4
-BLOCK_LOOP
-    ldr         r3, [r9, #detok_ptr_onyxblock2context_leftabove]
-    ldr         r2, [r9, #DETOK_A]
-    ldr         r1, [r9, #DETOK_L]
-    ldrb        r12, [r3, +r11]                                 ; detoken->ptr_onyxblock2context_leftabove[i]
-
-    cmp         r7, #0                                          ; check type
-    moveq       r7, #1
-    movne       r7, #0
-
-    ldr         r0, [r2, +r12, lsl #2]                          ; a
-    add         r1, r1, r12, lsl #4
-    add         r3, r3, r11
-
-    ldrb        r2, [r3, #c_onyxblock2above_offset]
-    ldrb        r3, [r3, #c_onyxblock2left_offset]
-    mov         lr, #c_entropy_nodes
-;;  ;++
-
-    ldr         r2, [r0, +r2, lsl #2]!
-    add         r3, r1, r3, lsl #2
-    str         r3, [sp, #l_l_ptr]
-    ldr         r3, [r3]
-
-    cmp         r2, #0
-    movne       r2, #1
-    cmp         r3, #0
-    addne       r2, r2, #1
-
-    str         r0, [sp, #l_a_ptr]
-    smlabb      r0, r2, lr, r10
-    mov         r1, #0                                          ; t = 0
-    str         r7, [sp, #l_c]
-
-    ;align 4
-COEFF_LOOP
-    ldr         r3, [r9, #detok_ptr_onyx_coef_bands_x]
-    ldr         lr, [r9, #detok_onyx_coef_tree_ptr]
-
-;;the following two lines are used if onyx_coef_bands_x is UINT16
-;;  add         r3, r3, r7, lsl #1
-;;  ldrh        r3, [r3]
-
-;;the following line is used if onyx_coef_bands_x is UINT8
-    ldrb        r3, [r7, +r3]
-
-
-;;  ;++
-;;  pld         [r8]
-    ;++
-    add         r0, r0, r3
-
-    ;align 4
-get_token_loop
-    ldrb        r2, [r0, +r1, asr #1]
-    mov         r3, r6, lsl #8
-    sub         r3, r3, #256                    ;split = 1 +  (((range-1) * probability) >> 8)
-    mov         r10, #1
-
-    smlawb      r2, r3, r2, r10
-    ldrb        r12, [r8]                       ;load cx data byte in stall slot
-    ;++
-
-    subs        r3, r4, r2, lsl #24             ;x = value-(split<<24)
-    addhs       r1, r1, #1                      ;t += 1
-    movhs       r4, r3                          ;update value
-    subhs       r2, r6, r2                      ;range = range - split
-    movlo       r6, r2
-
-;;; ldrsbhs     r1, [r1, +lr]
-    ldrsb     r1, [r1, +lr]
-
-
-;; use branch for short pipelines ???
-;;  cmp         r2, #0x80
-;;  bcs         |$LN22@decode_mb_to|
-
-    clz         r3, r2
-    sub         r3, r3, #24
-    subs        r5, r5, r3
-    mov         r6, r2, lsl r3
-    mov         r4, r4, lsl r3
-
-;; use branch for short pipelines ???
-;;  bgt         |$LN22@decode_mb_to|
-
-    addle         r5, r5, #8
-    rsble         r3, r5, #8
-    addle         r8, r8, #1
-    orrle         r4, r4, r12, lsl r3
-
-;;|$LN22@decode_mb_to|
-
-    cmp         r1, #0
-    bgt         get_token_loop
-
-    cmn         r1, #c_dct_eob_token             ;if(t == -DCT_EOB_TOKEN)
-    beq         END_OF_BLOCK
-
-    rsb         lr, r1, #0                      ;v = -t;
-
-    cmp         lr, #4                          ;if(v > FOUR_TOKEN)
-    ble         SKIP_EXTRABITS
-
-    ldr         r3, [r9, #detok_teb_base_ptr]
-    mov         r11, #1
-    add         r7, r3, lr, lsl #4
-
-    ldrsh       lr, [r7, #tokenextrabits_min_val];v = teb_ptr->min_val
-    ldrsh       r0, [r7, #tokenextrabits_length];bits_count = teb_ptr->Length
-
-extrabits_loop
-    add         r3, r0, r7
-
-    ldrb        r2, [r3, #4]
-    mov         r3, r6, lsl #8
-    sub         r3, r3, #256                    ;split = 1 +  (((range-1) * probability) >> 8)
-    mov         r10, #1
-
-    smlawb      r2, r3, r2, r10
-    ldrb        r12, [r8]
-    ;++
-
-    subs        r10, r4, r2, lsl #24            ;x = value-(split<<24)
-    movhs       r4, r10                         ;update value
-    subhs       r2, r6, r2                      ;range = range - split
-    addhs       lr, lr, r11, lsl r0             ;v += ((UINT16)1<<bits_count)
-    movlo       r6, r2                          ;range = split
-
-
-;; use branch for short pipelines ???
-;;  cmp         r2, #0x80
-;;  bcs         |$LN10@decode_mb_to|
-
-    clz         r3, r2
-    sub         r3, r3, #24
-    subs        r5, r5, r3
-    mov         r6, r2, lsl r3                  ;range
-    mov         r4, r4, lsl r3                  ;value
-
-    addle       r5, r5, #8
-    addle       r8, r8, #1
-    rsble       r3, r5, #8
-    orrle       r4, r4, r12, lsl r3
-
-;;|$LN10@decode_mb_to|
-    subs         r0, r0, #1
-    bpl         extrabits_loop
-
-
-SKIP_EXTRABITS
-    ldr         r11, [sp, #l_qcoeff]
-    ldr         r0, [sp, #l_coef_ptr]
-
-    cmp         r1, #0                          ;check for nonzero token
-    beq         SKIP_EOB_CHECK              ;if t is zero, we will skip the eob table chec
-
-    sub         r3, r6, #1                      ;range - 1
-    ;++
-    mov         r3, r3, lsl #7                  ; *= onyx_prob_half  (128)
-    ;++
-    mov         r3, r3, lsr #8
-    add         r2, r3, #1                      ;split
-
-    subs        r3, r4, r2, lsl #24             ;x = value-(split<<24)
-    movhs       r4, r3                          ;update value
-    subhs       r2, r6, r2                      ;range = range - split
-    mvnhs       r3, lr
-    addhs       lr, r3, #1                      ;v = (v ^ -1) + 1
-    movlo       r6, r2                          ;range = split
-
-;; use branch for short pipelines ???
-;;  cmp         r2, #0x80
-;;  bcs         |$LN6@decode_mb_to|
-
-    clz         r3, r2
-    sub         r3, r3, #24
-    subs        r5, r5, r3
-    mov         r6, r2, lsl r3
-    mov         r4, r4, lsl r3
-    ldrleb      r2, [r8], #1
-    addle       r5, r5, #8
-    rsble       r3, r5, #8
-    orrle       r4, r4, r2, lsl r3
-
-;;|$LN6@decode_mb_to|
-    add         r0, r0, #0xB
-
-    cmn         r1, #1
-
-    addlt       r0, r0, #0xB
-
-    mvn         r1, #1
-
-SKIP_EOB_CHECK
-    ldr         r7, [sp, #l_c]
-    ldr         r3, [r9, #detok_scan]
-    add         r1, r1, #2
-    cmp         r7, #(0x10 - 1)                     ;assume one less for now.... increment below
-
-    ldr         r3, [r3, +r7, lsl #2]
-    add         r7, r7, #1
-    add         r3, r11, r3, lsl #1
-
-    str         r7, [sp, #l_c]
-    strh        lr, [r3]
-
-    blt         COEFF_LOOP
-
-    sub         r7, r7, #1                          ;if(t != -DCT_EOB_TOKEN) --c
-
-END_OF_BLOCK
-    ldr         r3, [sp, #l_type]
-    ldr         r10, [sp, #l_coef_ptr]
-    ldr         r0, [sp, #l_qcoeff]
-    ldr         r11, [sp, #l_i]
-    ldr         r12, [sp, #l_stop]
-
-    cmp         r3, #0
-    moveq       r1, #1
-    movne       r1, #0
-    add         r3, r11, r9
-
-    cmp         r7, r1
-    strb        r7, [r3, #detok_eob]
-
-    ldr         r7, [sp, #l_l_ptr]
-    ldr         r2, [sp, #l_a_ptr]
-    movne       r3, #1
-    moveq       r3, #0
-
-    add         r0, r0, #0x20
-    add         r11, r11, #1
-    str         r3, [r7]
-    str         r3, [r2]
-    str         r0, [sp, #l_qcoeff]
-    str         r11, [sp, #l_i]
-
-    cmp         r11, r12                            ;i >= stop ?
-    ldr         r7, [sp, #l_type]
-    mov         lr, #0xB
-
-    blt         BLOCK_LOOP
-
-    cmp         r11, #0x19
-    bne         ln2_decode_mb_to
-
-    ldr         r12, [r9, #detok_qcoeff_start_ptr]
-    ldr         r10, [r9, #detok_coef_probs]
-    mov         r7, #0
-    mov         r3, #0x10
-    str         r12, [sp, #l_qcoeff]
-    str         r7, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]
-
-    b           BLOCK_LOOP
-
-ln2_decode_mb_to
-    cmp         r11, #0x10
-    bne         ln1_decode_mb_to
-
-    ldr         r10, [r9, #0x30]
-
-    mov         r7, #2
-    mov         r3, #0x18
-
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]
-    b           BLOCK_LOOP
-
-ln1_decode_mb_to
-    ldr         r2, [sp, #l_bc]
-    mov         r0, #0
-    nop
-
-    ldr         r3, [r2, #bool_decoder_buffer]
-    str         r5, [r2, #bool_decoder_count]
-    str         r4, [r2, #bool_decoder_value]
-    sub         r3, r8, r3
-    str         r3, [r2, #bool_decoder_pos]
-    str         r6, [r2, #bool_decoder_range]
-
-    add         sp, sp, #l_stacksize
-    ldmia       sp!, {r4 - r11, pc}
-
-    ENDP  ; |vp8_decode_mb_tokens_v5|
-
-    END
diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c
deleted file mode 100644
index 455c83a9c..000000000
--- a/vp8/decoder/arm/dsystemdependent.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "blockd.h"
-#include "pragmas.h"
-#include "postproc.h"
-#include "dboolhuff.h"
-#include "dequantize.h"
-#include "onyxd_int.h"
-
-void vp8_dmachine_specific_config(VP8D_COMP *pbi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    pbi->mb.rtcd         = &pbi->common.rtcd;
-#if HAVE_ARMV7
-    pbi->dequant.block   = vp8_dequantize_b_neon;
-    pbi->dequant.idct    = vp8_dequant_idct_neon;
-    pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon;
-    pbi->dboolhuff.start = vp8dx_start_decode_c;
-    pbi->dboolhuff.stop  = vp8dx_stop_decode_c;
-    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-
-#elif HAVE_ARMV6
-    pbi->dequant.block   = vp8_dequantize_b_v6;
-    pbi->dequant.idct    = vp8_dequant_idct_v6;
-    pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6;
-    pbi->dboolhuff.start = vp8dx_start_decode_c;
-    pbi->dboolhuff.stop  = vp8dx_stop_decode_c;
-    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-#endif
-#endif
-}
diff --git a/vp8/decoder/arm/neon/dboolhuff_neon.asm b/vp8/decoder/arm/neon/dboolhuff_neon.asm
index 7ec62a3d8..ff3ffda97 100644
--- a/vp8/decoder/arm/neon/dboolhuff_neon.asm
+++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequant_idct_neon.asm
index bba4d5dfb..1923be42a 100644
--- a/vp8/decoder/arm/neon/dequantidct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm
@@ -1,29 +1,41 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
-    EXPORT  |vp8_dequant_idct_neon|
+    EXPORT  |vp8_dequant_idct_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch);
+;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
+;                           unsigned char *dest, int pitch, int stride)
 ; r0    short *input,
 ; r1    short *dq,
-; r2    short *output,
-; r3    int pitch,
-|vp8_dequant_idct_neon| PROC
+; r2    unsigned char *pred
+; r3    unsigned char *dest
+; sp    int pitch
+; sp+4  int stride
+
+|vp8_dequant_idct_add_neon| PROC
     vld1.16         {q3, q4}, [r0]
     vld1.16         {q5, q6}, [r1]
+    ldr             r1, [sp]                ; pitch
+    vld1.32         {d14[0]}, [r2], r1
+    vld1.32         {d14[1]}, [r2], r1
+    vld1.32         {d15[0]}, [r2], r1
+    vld1.32         {d15[1]}, [r2]
+
+    ldr             r1, [sp, #4]            ; stride
 
-    ldr             r12, _didct_coeff_
+    ldr             r12, _CONSTANTS_
 
     vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
     vmul.i16        q2, q4, q6
@@ -41,14 +53,9 @@
     vshr.s16        q3, q3, #1
     vshr.s16        q4, q4, #1
 
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+    vqadd.s16       q3, q3, q2
     vqadd.s16       q4, q4, q2
 
-    ;d6 - c1:temp1
-    ;d7 - d1:temp2
-    ;d8 - d1:temp1
-    ;d9 - c1:temp2
-
     vqsub.s16       d10, d6, d9             ;c1
     vqadd.s16       d11, d7, d8             ;d1
 
@@ -77,7 +84,7 @@
     vshr.s16        q3, q3, #1
     vshr.s16        q4, q4, #1
 
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+    vqadd.s16       q3, q3, q2
     vqadd.s16       q4, q4, q2
 
     vqsub.s16       d10, d6, d9             ;c1
@@ -95,34 +102,29 @@
     vrshr.s16       d4, d4, #3
     vrshr.s16       d5, d5, #3
 
-    add             r1, r2, r3
-    add             r12, r1, r3
-    add             r0, r12, r3
-
     vtrn.32         d2, d4
     vtrn.32         d3, d5
     vtrn.16         d2, d3
     vtrn.16         d4, d5
 
-    vst1.16         {d2}, [r2]
-    vst1.16         {d3}, [r1]
-    vst1.16         {d4}, [r12]
-    vst1.16         {d5}, [r0]
+    vaddw.u8        q1, q1, d14
+    vaddw.u8        q2, q2, d15
 
-    bx             lr
+    vqmovun.s16     d0, q1
+    vqmovun.s16     d1, q2
+
+    vst1.32         {d0[0]}, [r3], r1
+    vst1.32         {d0[1]}, [r3], r1
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r3]
 
-    ENDP
+    bx             lr
 
-;-----------------
-    AREA    didct4x4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_didct_coeff_
-    DCD     didct_coeff
-didct_coeff
-    DCD     0x4e7b4e7b, 0x8a8c8a8c
+    ENDP           ; |vp8_dequant_idct_add_neon|
 
-;20091, 20091, 35468, 35468
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2       DCD 0x8a8c8a8c
 
     END
diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequantdcidct_neon.asm
deleted file mode 100644
index 3392f2c2b..000000000
--- a/vp8/decoder/arm/neon/dequantdcidct_neon.asm
+++ /dev/null
@@ -1,133 +0,0 @@
-;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_dc_idct_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc);
-; r0    short *input,
-; r1    short *dq,
-; r2    short *output,
-; r3    int pitch,
-; (stack)   int Dc
-|vp8_dequant_dc_idct_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-
-    ldr             r1, [sp]                ;load Dc from stack
-
-    ldr             r12, _dcidct_coeff_
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-    vmov.16         d2[0], r1
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
-    vqadd.s16       q4, q4, q2
-
-    ;d6 - c1:temp1
-    ;d7 - d1:temp2
-    ;d8 - d1:temp1
-    ;d9 - c1:temp2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    add             r1, r2, r3
-    add             r12, r1, r3
-    add             r0, r12, r3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vst1.16         {d2}, [r2]
-    vst1.16         {d3}, [r1]
-    vst1.16         {d4}, [r12]
-    vst1.16         {d5}, [r0]
-
-    bx             lr
-
-    ENDP
-
-;-----------------
-    AREA    dcidct4x4_dat, DATA, READWRITE          ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_dcidct_coeff_
-    DCD     dcidct_coeff
-dcidct_coeff
-    DCD     0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
-
-    END
diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/decoder/arm/neon/dequantizeb_neon.asm
index 1bde94607..c8e0c31f2 100644
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
new file mode 100644
index 000000000..fe4f2e0d4
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "dequantize.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+            (short *input, short *dq, unsigned char *pre, unsigned char *dst,
+             int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+            (short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+            (short *q, short *dq, unsigned char *pre, unsigned char *dst,
+             int pitch, int stride);
+void idct_dequant_0_2x_neon
+            (short *q, short dq, unsigned char *pre, int pitch,
+             unsigned char *dst, int stride);
+
+void vp8_dequant_dc_idct_add_y_block_neon
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, short *dc)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
+        else
+            idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
+
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
+        else
+            idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
+
+        q    += 64;
+        dc   += 4;
+        pre  += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_y_block_neon
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
+
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
+        else
+            idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
+
+        q    += 64;
+        pre  += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_neon
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    if (((short *)eobs)[0] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
+
+    q    += 32;
+    pre  += 32;
+    dstu += 4*stride;
+
+    if (((short *)eobs)[1] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
+
+    q += 32;
+    pre += 32;
+
+    if (((short *)eobs)[2] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
+
+    q    += 32;
+    pre  += 32;
+    dstv += 4*stride;
+
+    if (((short *)eobs)[3] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
+}
diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
new file mode 100644
index 000000000..456f8e1d4
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -0,0 +1,79 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+;                            int pitch, unsigned char *dst, int stride);
+; r0   *q
+; r1   dq
+; r2   *pre
+; r3   pitch
+; sp   *dst
+; sp+4 stride
+|idct_dequant_0_2x_neon| PROC
+    add             r12, r2, #4
+    vld1.32         {d2[0]}, [r2], r3
+    vld1.32         {d2[1]}, [r2], r3
+    vld1.32         {d4[0]}, [r2], r3
+    vld1.32         {d4[1]}, [r2]
+    vld1.32         {d8[0]}, [r12], r3
+    vld1.32         {d8[1]}, [r12], r3
+    vld1.32         {d10[0]}, [r12], r3
+    vld1.32         {d10[1]}, [r12]
+
+    ldrh            r12, [r0]               ; lo q
+    ldrh            r2, [r0, #32]           ; hi q
+    mov             r3, #0
+    strh            r3, [r0]
+    strh            r3, [r0, #32]
+
+    sxth            r12, r12                ; lo
+    mul             r0, r12, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q0, r0
+    sxth            r2, r2                  ; hi
+    mul             r0, r2, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    ldr             r2, [sp]                ; dst
+    ldr             r3, [sp, #4]            ; stride
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_0_2x_neon|
+    END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
new file mode 100644
index 000000000..0dc036acb
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -0,0 +1,69 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+;                               unsigned char *dst, int stride);
+; r0  *dc
+; r1  *pre
+; r2  *dst
+; r3  stride
+|idct_dequant_dc_0_2x_neon| PROC
+    ldr             r0, [r0]                ; *dc
+    mov             r12, #16
+
+    vld1.32         {d2[0]}, [r1], r12      ; lo
+    vld1.32         {d2[1]}, [r1], r12
+    vld1.32         {d4[0]}, [r1], r12
+    vld1.32         {d4[1]}, [r1]
+    sub             r1, r1, #44
+    vld1.32         {d8[0]}, [r1], r12      ; hi
+    vld1.32         {d8[1]}, [r1], r12
+    vld1.32         {d10[0]}, [r1], r12
+    vld1.32         {d10[1]}, [r1]
+
+    sxth            r1, r0                  ; lo *dc
+    add             r1, r1, #4
+    asr             r1, r1, #3
+    vdup.16         q0, r1
+    sxth            r0, r0, ror #16         ; hi *dc
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ;|idct_dequant_dc_0_2x_neon|
+    END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
new file mode 100644
index 000000000..ad4364adc
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -0,0 +1,206 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                                  unsigned char *dst, int stride, short *dc);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    stride
+; sp+4  *dc
+|idct_dequant_dc_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    mov             r1, #16                 ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    ldr             r1, [sp, #4]
+    vld1.32         {d31[1]}, [r12]
+
+    ldr             r2, _CONSTANTS_
+
+    ldrh            r12, [r1], #2           ; lo *dc
+    ldrh            r1, [r1]                ; hi *dc
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    ; move dc up to neon and overwrite first element
+    vmov.16         d4[0], r12
+    vmov.16         d8[0], r1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp]                ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
new file mode 100644
index 000000000..85fff11b3
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                               unsigned char *dst, int pitch, int stride);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    pitch
+; sp+4  stride
+|idct_dequant_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    ldr             r1, [sp]                ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    vld1.32         {d31[1]}, [r12]
+
+    ldr             r2, _CONSTANTS_
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp, #4]            ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
index 442054ed3..57cba16a3 100644
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -12,7 +13,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
 
-DECLARE_ALIGNED(16, const unsigned int, vp8dx_bitreader_norm[256]) =
+DECLARE_ALIGNED(16, const unsigned char, vp8dx_bitreader_norm[256]) =
 {
     0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -25,86 +26,41 @@ DECLARE_ALIGNED(16, const unsigned int, vp8dx_bitreader_norm[256]) =
 };
 
 
-static void copy_in(BOOL_DECODER *br, unsigned int to_write)
-{
-    if (to_write > br->user_buffer_sz)
-        to_write = br->user_buffer_sz;
-
-    memcpy(br->write_ptr, br->user_buffer, to_write);
-    br->user_buffer += to_write;
-    br->user_buffer_sz -= to_write;
-    br->write_ptr = br_ptr_advance(br->write_ptr, to_write);
-}
-
 int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
                         unsigned int source_sz)
 {
-    br->lowvalue = 0;
+    br->user_buffer_end = source+source_sz;
+    br->user_buffer     = source;
+    br->value    = 0;
+    br->count    = -8;
     br->range    = 255;
-    br->count    = 0;
-    br->user_buffer    = source;
-    br->user_buffer_sz = source_sz;
 
     if (source_sz && !source)
         return 1;
 
-    /* Allocate the ring buffer backing store with alignment equal to the
-     * buffer size*2 so that a single pointer can be used for wrapping rather
-     * than a pointer+offset.
-     */
-    br->decode_buffer  = vpx_memalign(VP8_BOOL_DECODER_SZ * 2,
-                                      VP8_BOOL_DECODER_SZ);
-
-    if (!br->decode_buffer)
-        return 1;
-
     /* Populate the buffer */
-    br->read_ptr = br->decode_buffer;
-    br->write_ptr = br->decode_buffer;
-    copy_in(br, VP8_BOOL_DECODER_SZ);
+    vp8dx_bool_decoder_fill_c(br);
 
-    /* Read the first byte */
-    br->value = (*br->read_ptr++) << 8;
     return 0;
 }
 
 
 void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
 {
-    int          left, right;
-
-    /* Find available room in the buffer */
-    left = 0;
-    right = br->read_ptr - br->write_ptr;
-
-    if (right < 0)
-    {
-        /* Read pointer is behind the write pointer. We can write from the
-         * write pointer to the end of the buffer.
-         */
-        right = VP8_BOOL_DECODER_SZ - (br->write_ptr - br->decode_buffer);
-        left = br->read_ptr - br->decode_buffer;
-    }
-
-    if (right + left < 128)
-        return;
-
-    if (right)
-        copy_in(br, right);
-
-    if (left)
-    {
-        br->write_ptr = br->decode_buffer;
-        copy_in(br, left);
-    }
-
-}
-
-
-void vp8dx_stop_decode_c(BOOL_DECODER *bc)
-{
-    vpx_free(bc->decode_buffer);
-    bc->decode_buffer = 0;
+    const unsigned char *bufptr;
+    const unsigned char *bufend;
+    VP8_BD_VALUE         value;
+    int                  count;
+    bufend = br->user_buffer_end;
+    bufptr = br->user_buffer;
+    value = br->value;
+    count = br->count;
+
+    VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
+    br->user_buffer = bufptr;
+    br->value = value;
+    br->count = count;
 }
 
 #if 0
@@ -119,13 +75,18 @@ void vp8dx_stop_decode_c(BOOL_DECODER *bc)
 int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
 {
     unsigned int bit=0;
+    VP8_BD_VALUE value;
     unsigned int split;
-    unsigned int bigsplit;
-    register unsigned int range = br->range;
-    register unsigned int value = br->value;
+    VP8_BD_VALUE bigsplit;
+    int count;
+    unsigned int range;
+
+    value = br->value;
+    count = br->count;
+    range = br->range;
 
     split = 1 + (((range-1) * probability) >> 8);
-    bigsplit = (split<<8);
+    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
 
     range = split;
     if(value >= bigsplit)
@@ -143,21 +104,16 @@ int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
     }*/
 
     {
-        int count = br->count;
         register unsigned int shift = vp8dx_bitreader_norm[range];
         range <<= shift;
         value <<= shift;
         count -= shift;
-        if(count <= 0)
-        {
-            value |= (*br->read_ptr) << (-count);
-            br->read_ptr = br_ptr_advance(br->read_ptr, 1);
-            count += 8 ;
-        }
-        br->count = count;
     }
     br->value = value;
+    br->count = count;
     br->range = range;
+    if (count < 0)
+        vp8dx_bool_decoder_fill_c(br);
     return bit;
 }
 
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index 772dbdb2e..c851aa7e5 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -1,60 +1,41 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #ifndef DBOOLHUFF_H
 #define DBOOLHUFF_H
+#include <stddef.h>
+#include <limits.h>
 #include "vpx_ports/config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
-/* Size of the bool decoder backing storage
- *
- * This size was chosen to be greater than the worst case encoding of a
- * single macroblock. This was calcluated as follows (python):
- *
- *     def max_cost(prob):
- *         return max(prob_costs[prob], prob_costs[255-prob]) / 256;
- *
- *     tree_nodes_cost = 7 * max_cost(255)
- *     extra_bits_cost = sum([max_cost(bit) for bit in extra_bits])
- *     sign_bit_cost = max_cost(128)
- *     total_cost = tree_nodes_cost + extra_bits_cost + sign_bit_cost
- *
- * where the prob_costs table was taken from the C vp8_prob_cost table in
- * boolhuff.c and the extra_bits table was taken from the 11 extrabits for
- * a category 6 token as defined in vp8d_token_extra_bits2/detokenize.c
- *
- * This equation produced a maximum of 79 bits per coefficient. Scaling up
- * to the macroblock level:
- *
- *     79 bits/coeff * 16 coeff/block * 25 blocks/macroblock = 31600 b/mb
- *
- *     4096 bytes = 32768 bits > 31600
- */
-#define VP8_BOOL_DECODER_SZ       4096
-#define VP8_BOOL_DECODER_MASK     (VP8_BOOL_DECODER_SZ-1)
-#define VP8_BOOL_DECODER_PTR_MASK (~(uintptr_t)(VP8_BOOL_DECODER_SZ))
+typedef size_t VP8_BD_VALUE;
+
+# define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+# define VP8_LOTS_OF_BITS (0x40000000)
+
+
 
 struct vp8_dboolhuff_rtcd_vtable;
 
 typedef struct
 {
-    unsigned int         lowvalue;
-    unsigned int         range;
-    unsigned int         value;
-    int                  count;
+    const unsigned char *user_buffer_end;
     const unsigned char *user_buffer;
-    unsigned int         user_buffer_sz;
-    unsigned char       *decode_buffer;
-    const unsigned char *read_ptr;
-    unsigned char       *write_ptr;
+    VP8_BD_VALUE         value;
+    int                  count;
+    unsigned int         range;
 #if CONFIG_RUNTIME_CPU_DETECT
     struct vp8_dboolhuff_rtcd_vtable *rtcd;
 #endif
@@ -62,10 +43,9 @@ typedef struct
 
 #define prototype_dbool_start(sym) int sym(BOOL_DECODER *br, \
     const unsigned char *source, unsigned int source_sz)
-#define prototype_dbool_stop(sym) void sym(BOOL_DECODER *bc)
 #define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
 #define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
-#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits);
+#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits)
 
 #if ARCH_ARM
 #include "arm/dboolhuff_arm.h"
@@ -75,10 +55,6 @@ typedef struct
 #define vp8_dbool_start vp8dx_start_decode_c
 #endif
 
-#ifndef vp8_dbool_stop
-#define vp8_dbool_stop vp8dx_stop_decode_c
-#endif
-
 #ifndef vp8_dbool_fill
 #define vp8_dbool_fill vp8dx_bool_decoder_fill_c
 #endif
@@ -92,48 +68,35 @@ typedef struct
 #endif
 
 extern prototype_dbool_start(vp8_dbool_start);
-extern prototype_dbool_stop(vp8_dbool_stop);
 extern prototype_dbool_fill(vp8_dbool_fill);
 extern prototype_dbool_debool(vp8_dbool_debool);
 extern prototype_dbool_devalue(vp8_dbool_devalue);
 
 typedef prototype_dbool_start((*vp8_dbool_start_fn_t));
-typedef prototype_dbool_stop((*vp8_dbool_stop_fn_t));
 typedef prototype_dbool_fill((*vp8_dbool_fill_fn_t));
 typedef prototype_dbool_debool((*vp8_dbool_debool_fn_t));
 typedef prototype_dbool_devalue((*vp8_dbool_devalue_fn_t));
 
 typedef struct vp8_dboolhuff_rtcd_vtable {
     vp8_dbool_start_fn_t   start;
-    vp8_dbool_stop_fn_t    stop;
     vp8_dbool_fill_fn_t    fill;
     vp8_dbool_debool_fn_t  debool;
     vp8_dbool_devalue_fn_t devalue;
 } vp8_dboolhuff_rtcd_vtable_t;
 
-// There are no processor-specific versions of these
-// functions right now. Disable RTCD to avoid using
-// function pointers which gives a speed boost
-//#ifdef ENABLE_RUNTIME_CPU_DETECT
-//#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
-//#define IF_RTCD(x) (x)
-//#else
+/* There are no processor-specific versions of these
+ * functions right now. Disable RTCD to avoid using
+ * function pointers which gives a speed boost
+ */
+/*#ifdef ENABLE_RUNTIME_CPU_DETECT
+#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
+#define IF_RTCD(x) (x)
+#else*/
 #define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
 #define IF_RTCD(x) NULL
-//#endif
-
-static unsigned char *br_ptr_advance(const unsigned char *_ptr,
-                                     unsigned int n)
-{
-    uintptr_t  ptr = (uintptr_t)_ptr;
-
-    ptr += n;
-    ptr &= VP8_BOOL_DECODER_PTR_MASK;
-
-    return (void *)ptr;
-}
+/*#endif*/
 
-DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
+DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 
 /* wrapper functions to hide RTCD. static means inline means hopefully no
  * penalty
@@ -146,12 +109,34 @@ static int vp8dx_start_decode(BOOL_DECODER *br,
 #endif
     return DBOOLHUFF_INVOKE(rtcd, start)(br, source, source_sz);
 }
-static void vp8dx_stop_decode(BOOL_DECODER *br) {
-    DBOOLHUFF_INVOKE(br->rtcd, stop)(br);
-}
 static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
     DBOOLHUFF_INVOKE(br->rtcd, fill)(br);
 }
+
+/*The refill loop is used in several places, so define it in a macro to make
+   sure they're all consistent.
+  An inline function would be cleaner, but has a significant penalty, because
+   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
+   enough to eliminate the stores to those fields and the subsequent reloads
+   from them when inlining the function.*/
+#define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
+    do \
+    { \
+        int shift; \
+        for(shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); shift >= 0; ) \
+        { \
+            if((_bufptr) >= (_bufend)) { \
+                (_count) = VP8_LOTS_OF_BITS; \
+                break; \
+            } \
+            (_count) += 8; \
+            (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
+            shift -= 8; \
+        } \
+    } \
+    while(0)
+
+
 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
   /*
    * Until optimized versions of this function are available, we
@@ -160,13 +145,18 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
    *return DBOOLHUFF_INVOKE(br->rtcd, debool)(br, probability);
    */
     unsigned int bit = 0;
+    VP8_BD_VALUE value;
     unsigned int split;
-    unsigned int bigsplit;
-    register unsigned int range = br->range;
-    register unsigned int value = br->value;
+    VP8_BD_VALUE bigsplit;
+    int count;
+    unsigned int range;
+
+    value = br->value;
+    count = br->count;
+    range = br->range;
 
     split = 1 + (((range - 1) * probability) >> 8);
-    bigsplit = (split << 8);
+    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
 
     range = split;
 
@@ -185,23 +175,16 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
     }*/
 
     {
-        int count = br->count;
         register unsigned int shift = vp8dx_bitreader_norm[range];
         range <<= shift;
         value <<= shift;
         count -= shift;
-
-        if (count <= 0)
-        {
-            value |= (*br->read_ptr) << (-count);
-            br->read_ptr = br_ptr_advance(br->read_ptr, 1);
-            count += 8 ;
-        }
-
-        br->count = count;
     }
     br->value = value;
+    br->count = count;
     br->range = range;
+    if(count < 0)
+        vp8dx_bool_decoder_fill(br);
     return bit;
 }
 
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index f151ef3cc..415392b68 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -13,10 +14,127 @@
 #include "entropymode.h"
 #include "onyxd_int.h"
 #include "findnearmv.h"
-#include "demode.h"
+
 #if CONFIG_DEBUG
 #include <assert.h>
 #endif
+static int vp8_read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
+
+    return i;
+}
+
+
+static int vp8_read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+
+    return i;
+}
+
+static int vp8_kfread_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+
+    return i;
+}
+
+
+
+static int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
+
+    return i;
+}
+
+static void vp8_read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
+{
+    /* Is segmentation enabled */
+    if (x->segmentation_enabled && x->update_mb_segmentation_map)
+    {
+        /* If so then read the segment id. */
+        if (vp8_read(r, x->mb_segment_tree_probs[0]))
+            mi->segment_id = (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]));
+        else
+            mi->segment_id = (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1]));
+    }
+}
+
+static void vp8_kfread_modes(VP8D_COMP *pbi, MODE_INFO *m, int mb_row, int mb_col)
+{
+    vp8_reader *const bc = & pbi->bc;
+    const int mis = pbi->common.mode_info_stride;
+
+        {
+            MB_PREDICTION_MODE y_mode;
+
+            /* Read the Macroblock segmentation map if it is being updated explicitly this frame (reset to 0 above by default)
+             * By default on a key frame reset all MBs to segment 0
+             */
+            m->mbmi.segment_id = 0;
+
+            if (pbi->mb.update_mb_segmentation_map)
+                vp8_read_mb_features(bc, &m->mbmi, &pbi->mb);
+
+            /* Read the macroblock coeff skip flag if this feature is in use, else default to 0 */
+            if (pbi->common.mb_no_coeff_skip)
+                m->mbmi.mb_skip_coeff = vp8_read(bc, pbi->prob_skip_false);
+            else
+                m->mbmi.mb_skip_coeff = 0;
+
+            y_mode = (MB_PREDICTION_MODE) vp8_kfread_ymode(bc, pbi->common.kf_ymode_prob);
+
+            m->mbmi.ref_frame = INTRA_FRAME;
+
+            if ((m->mbmi.mode = y_mode) == B_PRED)
+            {
+                int i = 0;
+
+                do
+                {
+                    const B_PREDICTION_MODE A = vp8_above_bmi(m, i, mis)->mode;
+                    const B_PREDICTION_MODE L = vp8_left_bmi(m, i)->mode;
+
+                    m->bmi[i].mode = (B_PREDICTION_MODE) vp8_read_bmode(bc, pbi->common.kf_bmode_prob [A] [L]);
+                }
+                while (++i < 16);
+            }
+            else
+            {
+                int BMode;
+                int i = 0;
+
+                switch (y_mode)
+                {
+                case DC_PRED:
+                    BMode = B_DC_PRED;
+                    break;
+                case V_PRED:
+                    BMode = B_VE_PRED;
+                    break;
+                case H_PRED:
+                    BMode = B_HE_PRED;
+                    break;
+                case TM_PRED:
+                    BMode = B_TM_PRED;
+                    break;
+                default:
+                    BMode = B_DC_PRED;
+                    break;
+                }
+
+                do
+                {
+                    m->bmi[i].mode = (B_PREDICTION_MODE)BMode;
+                }
+                while (++i < 16);
+            }
+
+            m->mbmi.uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, pbi->common.kf_uv_mode_prob);
+        }
+}
 
 static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
 {
@@ -98,6 +216,8 @@ static MB_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p)
 
     return (MB_PREDICTION_MODE)i;
 }
+
+#ifdef VPX_MODE_COUNT
 unsigned int vp8_mv_cont_count[5][4] =
 {
     { 0, 0, 0, 0 },
@@ -106,87 +226,108 @@ unsigned int vp8_mv_cont_count[5][4] =
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 }
 };
+#endif
 
-void vp8_decode_mode_mvs(VP8D_COMP *pbi)
-{
-    const MV Zero = { 0, 0};
+unsigned char vp8_mbsplit_offset[4][16] = {
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
 
-    VP8_COMMON *const pc = & pbi->common;
-    vp8_reader *const bc = & pbi->bc;
-    MACROBLOCKD *xd = &pbi->mb;
-    MODE_INFO *mi = pc->mi, *ms;
-    const int mis = pc->mode_info_stride;
+unsigned char vp8_mbsplit_fill_count[4] = {8, 8, 4, 1};
+unsigned char vp8_mbsplit_fill_offset[4][16] = {
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+    { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
+    { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
 
-    MV_CONTEXT *const mvc = pc->fc.mvc;
 
-    int mb_row = -1;
+
+void vp8_mb_mode_mv_init(VP8D_COMP *pbi)
+{
+    vp8_reader *const bc = & pbi->bc;
+    MV_CONTEXT *const mvc = pbi->common.fc.mvc;
 #if CONFIG_SEGMENTATION
-    int left_id, above_id;
-    int i;
-    int sum;
-    int index = 0;
+    MACROBLOCKD *const xd  = & pbi->mb;
 #endif
-    vp8_prob prob_intra;
-    vp8_prob prob_last;
-    vp8_prob prob_gf;
-    vp8_prob prob_skip_false = 0;
-
-    if (pc->mb_no_coeff_skip)
-        prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8);
-
-    prob_intra = (vp8_prob)vp8_read_literal(bc, 8);
-    prob_last  = (vp8_prob)vp8_read_literal(bc, 8);
-    prob_gf    = (vp8_prob)vp8_read_literal(bc, 8);
 
-    ms = pc->mi - 1;
+    pbi->prob_skip_false = 0;
+    if (pbi->common.mb_no_coeff_skip)
+        pbi->prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8);
 
-    if (vp8_read_bit(bc))
+    if(pbi->common.frame_type != KEY_FRAME)
     {
-        int i = 0;
+        pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8);
+        pbi->prob_last  = (vp8_prob)vp8_read_literal(bc, 8);
+        pbi->prob_gf    = (vp8_prob)vp8_read_literal(bc, 8);
 
-        do
+        if (vp8_read_bit(bc))
         {
-            pc->fc.ymode_prob[i] = (vp8_prob) vp8_read_literal(bc, 8);
-        }
-        while (++i < 4);
-    }
+            int i = 0;
 
-    if (vp8_read_bit(bc))
-    {
-        int i = 0;
+            do
+            {
+                pbi->common.fc.ymode_prob[i] = (vp8_prob) vp8_read_literal(bc, 8);
+            }
+            while (++i < 4);
+        }
 
-        do
+        if (vp8_read_bit(bc))
         {
-            pc->fc.uv_mode_prob[i] = (vp8_prob) vp8_read_literal(bc, 8);
+            int i = 0;
+
+            do
+            {
+                pbi->common.fc.uv_mode_prob[i] = (vp8_prob) vp8_read_literal(bc, 8);
+            }
+            while (++i < 3);
         }
-        while (++i < 3);
-    }
 
-    read_mvcontexts(bc, mvc);
+        read_mvcontexts(bc, mvc);
 #if CONFIG_SEGMENTATION
     xd->temporal_update = vp8_read_bit(bc);
 #endif
-    while (++mb_row < pc->mb_rows)
-    {
-        int mb_col = -1;
+    }
+}
 
-        while (++mb_col < pc->mb_cols)
-        {
-            MB_MODE_INFO *const mbmi = & mi->mbmi;
-            MV *const mv = & mbmi->mv.as_mv;
-            VP8_COMMON *const pc = &pbi->common;
-            // MACROBLOCKD *xd = &pbi->mb;
-
-            vp8dx_bool_decoder_fill(bc);
-            // Distance of Mb to the various image edges.
-            // These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
-            xd->mb_to_left_edge = -((mb_col * 16) << 3);
-            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-            xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
-            // If required read in new segmentation data for this MB
-            if (pbi->mb.update_mb_segmentation_map)
+void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
+                            int mb_row, int mb_col)
+{
+    const MV Zero = { 0, 0};
+    vp8_reader *const bc = & pbi->bc;
+    MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+    const int mis = pbi->common.mode_info_stride;
+#if CONFIG_SEGMENTATION
+    MACROBLOCKD *const xd  = & pbi->mb;
+    int sum;
+    int index = mb_row * pbi->common.mb_cols + mb_col;
+#endif
+    MV *const mv = & mbmi->mv.as_mv;
+    int mb_to_left_edge;
+    int mb_to_right_edge;
+    int mb_to_top_edge;
+    int mb_to_bottom_edge;
+
+    mb_to_top_edge = pbi->mb.mb_to_top_edge;
+    mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
+    mb_to_top_edge -= LEFT_TOP_MARGIN;
+    mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+    mbmi->need_to_clamp_mvs = 0;
+    /* Distance of Mb to the various image edges.
+     * These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
+     */
+    pbi->mb.mb_to_left_edge =
+    mb_to_left_edge = -((mb_col * 16) << 3);
+    mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+    pbi->mb.mb_to_right_edge =
+    mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
+    mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+
+    /* If required read in new segmentation data for this MB */
+    if (pbi->mb.update_mb_segmentation_map)
             {
 #if CONFIG_SEGMENTATION
                 if (xd->temporal_update)
@@ -196,7 +337,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
                     if (mb_col != 0)
                         sum += (mi-1)->mbmi.segment_flag;
                     if (mb_row != 0)
-                        sum += (mi-pc->mb_cols)->mbmi.segment_flag;
+                        sum += (mi-pbi->common.mb_cols)->mbmi.segment_flag;
 
                     if (vp8_read(bc, xd->mb_segment_tree_probs[3+sum]) == 0)
                     {
@@ -223,236 +364,237 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
             }
 
 
-            // Read the macroblock coeff skip flag if this feature is in use, else default to 0
-            if (pc->mb_no_coeff_skip)
-                mbmi->mb_skip_coeff = vp8_read(bc, prob_skip_false);
-            else
-                mbmi->mb_skip_coeff = 0;
-
-            mbmi->uv_mode = DC_PRED;
-
-            if ((mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, prob_intra)))    /* inter MB */
-            {
-                int rct[4];
-                vp8_prob mv_ref_p [VP8_MVREFS-1];
-                MV nearest, nearby, best_mv;
-
-                if (vp8_read(bc, prob_last))
-                {
-                    mbmi->ref_frame = (MV_REFERENCE_FRAME)((int)mbmi->ref_frame + (int)(1 + vp8_read(bc, prob_gf)));
-                }
-
-                vp8_find_near_mvs(xd, mi, &nearest, &nearby, &best_mv, rct, mbmi->ref_frame, pbi->common.ref_frame_sign_bias);
-
-                vp8_mv_ref_probs(mv_ref_p, rct);
-
-                switch (mbmi->mode = read_mv_ref(bc, mv_ref_p))
-                {
-                case SPLITMV:
-                {
-                    const int s = mbmi->partitioning = vp8_treed_read(
-                                                           bc, vp8_mbsplit_tree, vp8_mbsplit_probs
-                                                       );
-                    const int num_p = vp8_mbsplit_count [s];
-                    const int *const  L = vp8_mbsplits [s];
-                    int j = 0;
-
-                    do  /* for each subset j */
-                    {
-                        B_MODE_INFO *const bmi = mbmi->partition_bmi + j;
-                        MV *const mv = & bmi->mv.as_mv;
-
-                        int k = -1;  /* first block in subset j */
-                        int mv_contz;
-
-                        while (j != L[++k])
-                            if (k >= 16)
-#if CONFIG_DEBUG
-                                assert(0);
-
-#else
-                                ;
-#endif
-
-                        mv_contz = vp8_mv_cont(&(vp8_left_bmi(mi, k)->mv.as_mv), &(vp8_above_bmi(mi, k, mis)->mv.as_mv));
-
-                        switch (bmi->mode = (B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) //pc->fc.sub_mv_ref_prob))
-                        {
-                        case NEW4X4:
-                            read_mv(bc, mv, (const MV_CONTEXT *) mvc);
-                            mv->row += best_mv.row;
-                            mv->col += best_mv.col;
-#ifdef VPX_MODE_COUNT
-                            vp8_mv_cont_count[mv_contz][3]++;
-#endif
-                            break;
-                        case LEFT4X4:
-                            *mv = vp8_left_bmi(mi, k)->mv.as_mv;
-#ifdef VPX_MODE_COUNT
-                            vp8_mv_cont_count[mv_contz][0]++;
-#endif
-                            break;
-                        case ABOVE4X4:
-                            *mv = vp8_above_bmi(mi, k, mis)->mv.as_mv;
-#ifdef VPX_MODE_COUNT
-                            vp8_mv_cont_count[mv_contz][1]++;
-#endif
-                            break;
-                        case ZERO4X4:
-                            *mv = Zero;
-#ifdef VPX_MODE_COUNT
-                            vp8_mv_cont_count[mv_contz][2]++;
-#endif
-                            break;
-                        default:
-                            break;
-                        }
-
-                        /* Fill (uniform) modes, mvs of jth subset.
-                           Must do it here because ensuing subsets can
-                           refer back to us via "left" or "above". */
-                        do
-                            if (j == L[k])
-                                mi->bmi[k] = *bmi;
-
-                        while (++k < 16);
-                    }
-                    while (++j < num_p);
-                }
-
-                *mv = mi->bmi[15].mv.as_mv;
+    /* Read the macroblock coeff skip flag if this feature is in use, else default to 0 */
+    if (pbi->common.mb_no_coeff_skip)
+        mbmi->mb_skip_coeff = vp8_read(bc, pbi->prob_skip_false);
+    else
+        mbmi->mb_skip_coeff = 0;
 
-                break;  /* done with SPLITMV */
-
-                case NEARMV:
-                    *mv = nearby;
-
-                    // Clip "next_nearest" so that it does not extend to far out of image
-                    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-                        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-                    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-                        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+    if ((mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra)))    /* inter MB */
+    {
+        int rct[4];
+        vp8_prob mv_ref_p [VP8_MVREFS-1];
+        MV nearest, nearby, best_mv;
 
-                    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-                        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-                    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-                        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+        if (vp8_read(bc, pbi->prob_last))
+        {
+            mbmi->ref_frame = (MV_REFERENCE_FRAME)((int)mbmi->ref_frame + (int)(1 + vp8_read(bc, pbi->prob_gf)));
+        }
 
-                    goto propagate_mv;
+        vp8_find_near_mvs(&pbi->mb, mi, &nearest, &nearby, &best_mv, rct, mbmi->ref_frame, pbi->common.ref_frame_sign_bias);
 
-                case NEARESTMV:
-                    *mv = nearest;
+        vp8_mv_ref_probs(mv_ref_p, rct);
 
-                    // Clip "next_nearest" so that it does not extend to far out of image
-                    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-                        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-                    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-                        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+        mbmi->uv_mode = DC_PRED;
+        switch (mbmi->mode = read_mv_ref(bc, mv_ref_p))
+        {
+        case SPLITMV:
+        {
+            const int s = mbmi->partitioning =
+                      vp8_treed_read(bc, vp8_mbsplit_tree, vp8_mbsplit_probs);
+            const int num_p = vp8_mbsplit_count [s];
+            int j = 0;
 
-                    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-                        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-                    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-                        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+            do  /* for each subset j */
+            {
+                B_MODE_INFO bmi;
+                MV *const mv = & bmi.mv.as_mv;
 
-                    goto propagate_mv;
+                int k;  /* first block in subset j */
+                int mv_contz;
+                k = vp8_mbsplit_offset[s][j];
 
-                case ZEROMV:
-                    *mv = Zero;
-                    goto propagate_mv;
+                mv_contz = vp8_mv_cont(&(vp8_left_bmi(mi, k)->mv.as_mv), &(vp8_above_bmi(mi, k, mis)->mv.as_mv));
 
-                case NEWMV:
+                switch (bmi.mode = (B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/
+                {
+                case NEW4X4:
                     read_mv(bc, mv, (const MV_CONTEXT *) mvc);
                     mv->row += best_mv.row;
                     mv->col += best_mv.col;
-                    /* Encoder should not produce invalid motion vectors, but since
-                     * arbitrary length MVs can be parsed from the bitstream, we
-                     * need to clamp them here in case we're reading bad data to
-                     * avoid a crash.
-                     */
-#if CONFIG_DEBUG
-                    assert(mv->col >= (xd->mb_to_left_edge - LEFT_TOP_MARGIN));
-                    assert(mv->col <= (xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN));
-                    assert(mv->row >= (xd->mb_to_top_edge - LEFT_TOP_MARGIN));
-                    assert(mv->row <= (xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN));
-#endif
+  #ifdef VPX_MODE_COUNT
+                    vp8_mv_cont_count[mv_contz][3]++;
+  #endif
+                    break;
+                case LEFT4X4:
+                    *mv = vp8_left_bmi(mi, k)->mv.as_mv;
+  #ifdef VPX_MODE_COUNT
+                    vp8_mv_cont_count[mv_contz][0]++;
+  #endif
+                    break;
+                case ABOVE4X4:
+                    *mv = vp8_above_bmi(mi, k, mis)->mv.as_mv;
+  #ifdef VPX_MODE_COUNT
+                    vp8_mv_cont_count[mv_contz][1]++;
+  #endif
+                    break;
+                case ZERO4X4:
+                    *mv = Zero;
+  #ifdef VPX_MODE_COUNT
+                    vp8_mv_cont_count[mv_contz][2]++;
+  #endif
+                    break;
+                default:
+                    break;
+                }
 
-                    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-                        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-                    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-                        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+                mbmi->need_to_clamp_mvs |= (mv->col < mb_to_left_edge) ? 1 : 0;
+                mbmi->need_to_clamp_mvs |= (mv->col > mb_to_right_edge) ? 1 : 0;
+                mbmi->need_to_clamp_mvs |= (mv->row < mb_to_top_edge) ? 1 : 0;
+                mbmi->need_to_clamp_mvs |= (mv->row > mb_to_bottom_edge) ? 1 : 0;
 
-                    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-                        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-                    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-                        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+                {
+                    /* Fill (uniform) modes, mvs of jth subset.
+                     Must do it here because ensuing subsets can
+                     refer back to us via "left" or "above". */
+                    unsigned char *fill_offset;
+                    unsigned int fill_count = vp8_mbsplit_fill_count[s];
 
-                propagate_mv:  /* same MV throughout */
-                    {
-                        //int i=0;
-                        //do
-                        //{
-                        //  mi->bmi[i].mv.as_mv = *mv;
-                        //}
-                        //while( ++i < 16);
-
-                        mi->bmi[0].mv.as_mv = *mv;
-                        mi->bmi[1].mv.as_mv = *mv;
-                        mi->bmi[2].mv.as_mv = *mv;
-                        mi->bmi[3].mv.as_mv = *mv;
-                        mi->bmi[4].mv.as_mv = *mv;
-                        mi->bmi[5].mv.as_mv = *mv;
-                        mi->bmi[6].mv.as_mv = *mv;
-                        mi->bmi[7].mv.as_mv = *mv;
-                        mi->bmi[8].mv.as_mv = *mv;
-                        mi->bmi[9].mv.as_mv = *mv;
-                        mi->bmi[10].mv.as_mv = *mv;
-                        mi->bmi[11].mv.as_mv = *mv;
-                        mi->bmi[12].mv.as_mv = *mv;
-                        mi->bmi[13].mv.as_mv = *mv;
-                        mi->bmi[14].mv.as_mv = *mv;
-                        mi->bmi[15].mv.as_mv = *mv;
-                    }
+                    fill_offset = &vp8_mbsplit_fill_offset[s][(unsigned char)j * vp8_mbsplit_fill_count[s]];
 
-                    break;
+                    do {
+                        mi->bmi[ *fill_offset] = bmi;
+                      fill_offset++;
 
-                default:;
-#if CONFIG_DEBUG
-                    assert(0);
-#endif
+                    }while (--fill_count);
                 }
 
             }
-            else
-            {
-                /* MB is intra coded */
-
-                int j = 0;
+            while (++j < num_p);
+        }
 
+        *mv = mi->bmi[15].mv.as_mv;
+
+        break;  /* done with SPLITMV */
+
+        case NEARMV:
+            *mv = nearby;
+            /* Clip "next_nearest" so that it does not extend to far out of image */
+            mv->col = (mv->col < mb_to_left_edge) ? mb_to_left_edge : mv->col;
+            mv->col = (mv->col > mb_to_right_edge) ? mb_to_right_edge : mv->col;
+            mv->row = (mv->row < mb_to_top_edge) ? mb_to_top_edge : mv->row;
+            mv->row = (mv->row > mb_to_bottom_edge) ? mb_to_bottom_edge : mv->row;
+            goto propagate_mv;
+
+        case NEARESTMV:
+            *mv = nearest;
+            /* Clip "next_nearest" so that it does not extend to far out of image */
+            mv->col = (mv->col < mb_to_left_edge) ? mb_to_left_edge : mv->col;
+            mv->col = (mv->col > mb_to_right_edge) ? mb_to_right_edge : mv->col;
+            mv->row = (mv->row < mb_to_top_edge) ? mb_to_top_edge : mv->row;
+            mv->row = (mv->row > mb_to_bottom_edge) ? mb_to_bottom_edge : mv->row;
+            goto propagate_mv;
+
+        case ZEROMV:
+            *mv = Zero;
+            goto propagate_mv;
+
+        case NEWMV:
+            read_mv(bc, mv, (const MV_CONTEXT *) mvc);
+            mv->row += best_mv.row;
+            mv->col += best_mv.col;
+
+            /* Don't need to check this on NEARMV and NEARESTMV modes
+             * since those modes clamp the MV. The NEWMV mode does not,
+             * so signal to the prediction stage whether special
+             * handling may be required.
+             */
+            mbmi->need_to_clamp_mvs = (mv->col < mb_to_left_edge) ? 1 : 0;
+            mbmi->need_to_clamp_mvs |= (mv->col > mb_to_right_edge) ? 1 : 0;
+            mbmi->need_to_clamp_mvs |= (mv->row < mb_to_top_edge) ? 1 : 0;
+            mbmi->need_to_clamp_mvs |= (mv->row > mb_to_bottom_edge) ? 1 : 0;
+
+        propagate_mv:  /* same MV throughout */
+            {
+                /*int i=0;
                 do
                 {
-                    mi->bmi[j].mv.as_mv = Zero;
+                  mi->bmi[i].mv.as_mv = *mv;
                 }
-                while (++j < 16);
+                while( ++i < 16);*/
+
+                mi->bmi[0].mv.as_mv = *mv;
+                mi->bmi[1].mv.as_mv = *mv;
+                mi->bmi[2].mv.as_mv = *mv;
+                mi->bmi[3].mv.as_mv = *mv;
+                mi->bmi[4].mv.as_mv = *mv;
+                mi->bmi[5].mv.as_mv = *mv;
+                mi->bmi[6].mv.as_mv = *mv;
+                mi->bmi[7].mv.as_mv = *mv;
+                mi->bmi[8].mv.as_mv = *mv;
+                mi->bmi[9].mv.as_mv = *mv;
+                mi->bmi[10].mv.as_mv = *mv;
+                mi->bmi[11].mv.as_mv = *mv;
+                mi->bmi[12].mv.as_mv = *mv;
+                mi->bmi[13].mv.as_mv = *mv;
+                mi->bmi[14].mv.as_mv = *mv;
+                mi->bmi[15].mv.as_mv = *mv;
+            }
+            break;
+        default:;
+  #if CONFIG_DEBUG
+            assert(0);
+  #endif
+        }
+    }
+    else
+    {
+        /* MB is intra coded */
+        int j = 0;
+        do
+        {
+            mi->bmi[j].mv.as_mv = Zero;
+        }
+        while (++j < 16);
 
-                *mv = Zero;
+        if ((mbmi->mode = (MB_PREDICTION_MODE) vp8_read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED)
+        {
+            j = 0;
+            do
+            {
+                mi->bmi[j].mode = (B_PREDICTION_MODE)vp8_read_bmode(bc, pbi->common.fc.bmode_prob);
+            }
+            while (++j < 16);
+        }
 
-                if ((mbmi->mode = (MB_PREDICTION_MODE) vp8_read_ymode(bc, pc->fc.ymode_prob)) == B_PRED)
-                {
-                    int j = 0;
+        mbmi->uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, pbi->common.fc.uv_mode_prob);
+    }
 
-                    do
-                    {
-                        mi->bmi[j].mode = (B_PREDICTION_MODE)vp8_read_bmode(bc, pc->fc.bmode_prob);
-                    }
-                    while (++j < 16);
-                }
+}
 
-                mbmi->uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, pc->fc.uv_mode_prob);
-            }
+void vp8_decode_mode_mvs(VP8D_COMP *pbi)
+{
+    MODE_INFO *mi = pbi->common.mi;
+    int mb_row = -1;
+
+    vp8_mb_mode_mv_init(pbi);
+
+    while (++mb_row < pbi->common.mb_rows)
+    {
+        int mb_col = -1;
+        int mb_to_top_edge;
+        int mb_to_bottom_edge;
 
-            mi++;       // next macroblock
+        pbi->mb.mb_to_top_edge =
+        mb_to_top_edge = -((mb_row * 16)) << 3;
+        mb_to_top_edge -= LEFT_TOP_MARGIN;
+
+        pbi->mb.mb_to_bottom_edge =
+        mb_to_bottom_edge = ((pbi->common.mb_rows - 1 - mb_row) * 16) << 3;
+        mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+
+        while (++mb_col < pbi->common.mb_cols)
+        {
+            /*vp8_read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
+            if(pbi->common.frame_type == KEY_FRAME)
+                vp8_kfread_modes(pbi, mi, mb_row, mb_col);
+            else
+                vp8_read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
+
+            mi++;       /* next macroblock */
         }
 
-        mi++;           // skip left predictor each row
+        mi++;           /* skip left predictor each row */
     }
 }
+
diff --git a/vp8/decoder/decodemv.h b/vp8/decoder/decodemv.h
index 403007183..940342447 100644
--- a/vp8/decoder/decodemv.h
+++ b/vp8/decoder/decodemv.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
diff --git a/vp8/decoder/decoderthreading.h b/vp8/decoder/decoderthreading.h
index ebc5c27b2..25dee8fe8 100644
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -14,11 +15,12 @@
 #ifndef _DECODER_THREADING_H
 #define _DECODER_THREADING_H
 
-
-extern void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
-                                 MACROBLOCKD *xd);
-extern void vp8_stop_lfthread(VP8D_COMP *pbi);
-extern void vp8_start_lfthread(VP8D_COMP *pbi);
+#if CONFIG_MULTITHREAD
+extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
 extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
 extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
+extern int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+#endif
+
 #endif
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 01cd7aedf..06204fec6 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -20,9 +21,10 @@
 #include "alloccommon.h"
 #include "entropymode.h"
 #include "quant_common.h"
-#include "segmentation_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/yv12extend.h"
 #include "setupintrarecon.h"
-#include "demode.h"
+
 #include "decodemv.h"
 #include "extend.h"
 #include "vpx_mem/vpx_mem.h"
@@ -38,56 +40,53 @@
 
 void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
 {
-    int r, c;
     int i;
     int Q;
     VP8_COMMON *const pc = & pbi->common;
 
     for (Q = 0; Q < QINDEX_RANGE; Q++)
     {
-        pc->Y1dequant[Q][0][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
-        pc->Y2dequant[Q][0][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
-        pc->UVdequant[Q][0][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+        pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+        pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+        pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
 
-        // all the ac values = ;
+        /* all the ac values = ; */
         for (i = 1; i < 16; i++)
         {
             int rc = vp8_default_zig_zag1d[i];
-            r = (rc >> 2);
-            c = (rc & 3);
 
-            pc->Y1dequant[Q][r][c] = (short)vp8_ac_yquant(Q);
-            pc->Y2dequant[Q][r][c] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
-            pc->UVdequant[Q][r][c] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+            pc->Y1dequant[Q][rc] = (short)vp8_ac_yquant(Q);
+            pc->Y2dequant[Q][rc] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+            pc->UVdequant[Q][rc] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
         }
     }
 }
 
-static void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
     int i;
     int QIndex;
     MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
     VP8_COMMON *const pc = & pbi->common;
 
-    // Decide whether to use the default or alternate baseline Q value.
+    /* Decide whether to use the default or alternate baseline Q value. */
     if (xd->segmentation_enabled)
     {
-        // Abs Value
+        /* Abs Value */
         if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
             QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
 
-        // Delta Value
+        /* Delta Value */
         else
         {
             QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
-            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    // Clamp to valid range
+            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
         }
     }
     else
         QIndex = pc->base_qindex;
 
-    // Set up the block level dequant pointers
+    /* Set up the block level dequant pointers */
     for (i = 0; i < 16; i++)
     {
         xd->block[i].dequant = pc->Y1dequant[QIndex];
@@ -108,11 +107,12 @@ static void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
 #define RTCD_VTABLE(x) NULL
 #endif
 
-//skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
-// to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
+/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
+ *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
+ */
 static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-    if (xd->frame_type == KEY_FRAME  ||  xd->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
 
         vp8_build_intra_predictors_mbuv_s(xd);
@@ -125,42 +125,114 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
     }
 }
 
-static void reconstruct_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+    /* If the MV points so far into the UMV border that no visible pixels
+     * are used for reconstruction, the subpel part of the MV can be
+     * discarded and the MV limited to 16 pixels with equivalent results.
+     *
+     * This limit kicks in at 19 pixels for the top and left edges, for
+     * the 16 pixels plus 3 taps right of the central pixel when subpel
+     * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+     * left of the central pixel when filtering.
+     */
+    if (mv->col < (xd->mb_to_left_edge - (19 << 3)))
+        mv->col = xd->mb_to_left_edge - (16 << 3);
+    else if (mv->col > xd->mb_to_right_edge + (18 << 3))
+        mv->col = xd->mb_to_right_edge + (16 << 3);
+
+    if (mv->row < (xd->mb_to_top_edge - (19 << 3)))
+        mv->row = xd->mb_to_top_edge - (16 << 3);
+    else if (mv->row > xd->mb_to_bottom_edge + (18 << 3))
+        mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+    mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ? (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+    mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ? (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+    mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ? (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+    mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ? (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+void clamp_mvs(MACROBLOCKD *xd)
 {
-    if (xd->frame_type == KEY_FRAME  ||  xd->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->mode_info_context->mbmi.mode == SPLITMV)
+    {
+        int i;
+
+        for (i=0; i<16; i++)
+            clamp_mv_to_umv_border(&xd->block[i].bmi.mv.as_mv, xd);
+        for (i=16; i<24; i++)
+            clamp_uvmv_to_umv_border(&xd->block[i].bmi.mv.as_mv, xd);
+    }
+    else
+    {
+        clamp_mv_to_umv_border(&xd->mode_info_context->mbmi.mv.as_mv, xd);
+        clamp_uvmv_to_umv_border(&xd->block[16].bmi.mv.as_mv, xd);
+    }
+
+}
+
+void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int eobtotal = 0;
+    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
+
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        vp8_reset_mb_tokens_context(xd);
+    }
+    else
+    {
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+    }
+
+    /* Perform temporary clamping of the MV to be used for prediction */
+    if (do_clamp)
+    {
+        clamp_mvs(xd);
+    }
+
+    xd->mode_info_context->mbmi.dc_diff = 1;
+
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    {
+        xd->mode_info_context->mbmi.dc_diff = 0;
+        skip_recon_mb(pbi, xd);
+        return;
+    }
+
+    if (xd->segmentation_enabled)
+        mb_init_dequantizer(pbi, xd);
+
+    /* do prediction */
+    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
         vp8_build_intra_predictors_mbuv(xd);
 
-        if (xd->mbmi.mode != B_PRED)
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
         {
             vp8_build_intra_predictors_mby_ptr(xd);
-            vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
-        }
-        else
-        {
-            vp8_recon_intra4x4mb(RTCD_VTABLE(recon), xd);
+        } else {
+            vp8_intra_prediction_down_copy(xd);
         }
     }
     else
     {
         vp8_build_inter_predictors_mb(xd);
-        vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
     }
-}
 
-
-static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
-{
-    int i;
-    BLOCKD *b = &xd->block[24];
-
-
-    if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
+    /* dequantization and idct */
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
     {
+        BLOCKD *b = &xd->block[24];
         DEQUANT_INVOKE(&pbi->dequant, block)(b);
 
-        // do 2nd order transform on the dc block
-        if (b->eob > 1)
+        /* do 2nd order transform on the dc block */
+        if (xd->eobs[24] > 1)
         {
             IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
             ((int *)b->qcoeff)[0] = 0;
@@ -178,86 +250,50 @@ static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
             ((int *)b->qcoeff)[0] = 0;
         }
 
-
-        for (i = 0; i < 16; i++)
-        {
-
-            b = &xd->block[i];
-
-            if (b->eob > 1)
-            {
-                DEQUANT_INVOKE(&pbi->dequant, idct_dc)(b->qcoeff, &b->dequant[0][0], b->diff, 32, xd->block[24].diff[i]);
-            }
-            else
-            {
-                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(xd->block[24].diff[i], b->diff, 32);
-            }
-        }
-
-        for (i = 16; i < 24; i++)
-        {
-            b = &xd->block[i];
-
-            if (b->eob > 1)
-            {
-                DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, 16);
-            }
-            else
-            {
-                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, 16);
-                ((int *)b->qcoeff)[0] = 0;
-            }
-        }
+        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+                        (xd->qcoeff, xd->block[0].dequant,
+                         xd->predictor, xd->dst.y_buffer,
+                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
     }
-    else
+    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
     {
-        for (i = 0; i < 24; i++)
+        for (i = 0; i < 16; i++)
         {
 
-            b = &xd->block[i];
+            BLOCKD *b = &xd->block[i];
+            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
 
-            if (b->eob > 1)
+            if (xd->eobs[i] > 1)
             {
-                DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, (32 - (i & 16)));
+                DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                    (b->qcoeff, b->dequant,  b->predictor,
+                    *(b->base_dst) + b->dst, 16, b->dst_stride);
             }
             else
             {
-                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, (32 - (i & 16)));
+                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                    (b->qcoeff[0] * b->dequant[0], b->predictor,
+                    *(b->base_dst) + b->dst, 16, b->dst_stride);
                 ((int *)b->qcoeff)[0] = 0;
             }
         }
-    }
-}
-
-void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
-{
-    int eobtotal = 0;
 
-    if (xd->mbmi.mb_skip_coeff)
-    {
-        vp8_reset_mb_tokens_context(xd);
     }
     else
     {
-        eobtotal = vp8_decode_mb_tokens(pbi, xd);
-    }
-
-    xd->mode_info_context->mbmi.dc_diff = 1;
-
-    if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV && eobtotal == 0)
-    {
-        xd->mode_info_context->mbmi.dc_diff = 0;
-        skip_recon_mb(pbi, xd);
-        return;
+        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+                        (xd->qcoeff, xd->block[0].dequant,
+                         xd->predictor, xd->dst.y_buffer,
+                         xd->dst.y_stride, xd->eobs);
     }
 
-    if (xd->segmentation_enabled)
-        mb_init_dequantizer(pbi, xd);
-
-    de_quantand_idct(pbi, xd);
-    reconstruct_mb(pbi, xd);
+    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+                    (xd->qcoeff+16*16, xd->block[16].dequant,
+                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+                     xd->dst.uv_stride, xd->eobs+16);
 }
 
+
 static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
 {
     int ret_val = 0;
@@ -293,18 +329,17 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
     int i;
     int recon_yoffset, recon_uvoffset;
     int mb_col;
-    int recon_y_stride = pc->last_frame.y_stride;
-    int recon_uv_stride = pc->last_frame.uv_stride;
+    int ref_fb_idx = pc->lst_fb_idx;
+    int dst_fb_idx = pc->new_fb_idx;
+    int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+    int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
 
-    vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+    vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
     recon_yoffset = mb_row * recon_y_stride * 16;
     recon_uvoffset = mb_row * recon_uv_stride * 8;
-    // reset above block coeffs
+    /* reset above block coeffs */
 
-    xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
-    xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
-    xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
-    xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
+    xd->above_context = pc->above_context;
     xd->up_available = (mb_row != 0);
 
     xd->mb_to_top_edge = -((mb_row * 16)) << 3;
@@ -312,10 +347,8 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
 
     for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
     {
-        // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
-        vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi, 32); //sizeof(MB_MODE_INFO) );
 
-        if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
+        if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
         {
             for (i = 0; i < 16; i++)
             {
@@ -324,48 +357,38 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
             }
         }
 
-        // Distance of Mb to the various image edges.
-        // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+        /* Distance of Mb to the various image edges.
+         * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+         */
         xd->mb_to_left_edge = -((mb_col * 16) << 3);
         xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
 
-        xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
-        xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
-        xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;
+        xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+        xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+        xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
 
         xd->left_available = (mb_col != 0);
 
-        // Select the appropriate reference frame for this MB
-        if (xd->mbmi.ref_frame == LAST_FRAME)
-        {
-            xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
-            xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
-            xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
-        }
-        else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
-        {
-            // Golden frame reconstruction buffer
-            xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
-            xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
-            xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
-        }
+        /* Select the appropriate reference frame for this MB */
+        if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+            ref_fb_idx = pc->lst_fb_idx;
+        else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+            ref_fb_idx = pc->gld_fb_idx;
         else
-        {
-            // Alternate reference frame reconstruction buffer
-            xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
-            xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
-            xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
-        }
+            ref_fb_idx = pc->alt_fb_idx;
+
+        xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
 
         vp8_build_uvmvs(xd, pc->full_pixel);
 
         /*
-        if(pbi->common.current_video_frame==0 &&mb_col==1 && mb_row==0)
+        if(pc->current_video_frame==0 &&mb_col==1 && mb_row==0)
         pbi->debugoutput =1;
         else
         pbi->debugoutput =0;
         */
-        vp8dx_bool_decoder_fill(xd->current_bc);
         vp8_decode_macroblock(pbi, xd);
 
 
@@ -374,25 +397,17 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
 
         ++xd->mode_info_context;  /* next mb */
 
-        xd->gf_active_ptr++;      // GF useage flag for next MB
+        xd->above_context++;
 
-        xd->above_context[Y1CONTEXT] += 4;
-        xd->above_context[UCONTEXT ] += 2;
-        xd->above_context[VCONTEXT ] += 2;
-        xd->above_context[Y2CONTEXT] ++;
-
-        pbi->current_mb_col_main = mb_col;
     }
 
-    // adjust to the next row of mbs
+    /* adjust to the next row of mbs */
     vp8_extend_mb_row(
-        &pc->new_frame,
+        &pc->yv12_fb[dst_fb_idx],
         xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
     );
 
     ++xd->mode_info_context;      /* skip prediction column */
-
-    pbi->last_mb_row_decoded = mb_row;
 }
 
 
@@ -432,7 +447,7 @@ static void setup_token_decoder(VP8D_COMP *pbi,
     for (i = 0; i < num_part; i++)
     {
         const unsigned char *partition_size_ptr = cx_data + i * 3;
-        unsigned int         partition_size;
+        ptrdiff_t            partition_size;
 
         /* Calculate the length of this partition. The last partition
          * size is implicit.
@@ -446,7 +461,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
             partition_size = user_data_end - partition;
         }
 
-        if (partition + partition_size > user_data_end)
+        if (partition + partition_size > user_data_end
+            || partition + partition_size < partition)
             vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                                "Truncated packet or corrupt partition "
                                "%d length", i + 1);
@@ -473,18 +489,7 @@ static void stop_token_decoder(VP8D_COMP *pbi)
     VP8_COMMON *pc = &pbi->common;
 
     if (pc->multi_token_partition != ONE_PARTITION)
-    {
-        int num_part = (1 << pc->multi_token_partition);
-
-        for (i = 0; i < num_part; i++)
-        {
-            vp8dx_stop_decode(&pbi->mbc[i]);
-        }
-
         vpx_free(pbi->mbc);
-    }
-    else
-        vp8dx_stop_decode(& pbi->bc2);
 }
 
 static void init_frame(VP8D_COMP *pbi)
@@ -494,7 +499,7 @@ static void init_frame(VP8D_COMP *pbi)
 
     if (pc->frame_type == KEY_FRAME)
     {
-        // Various keyframe initializations
+        /* Various keyframe initializations */
         vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
 
         vp8_init_mbmode_probs(pc);
@@ -502,22 +507,23 @@ static void init_frame(VP8D_COMP *pbi)
         vp8_default_coef_probs(pc);
         vp8_kf_default_bmode_probs(pc->kf_bmode_prob);
 
-        // reset the segment feature data to 0 with delta coding (Default state).
+        /* reset the segment feature data to 0 with delta coding (Default state). */
         vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
         xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
 
-       // reset the mode ref deltasa for loop filter
+        /* reset the mode ref deltasa for loop filter */
         vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
         vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
 
-        // All buffers are implicitly updated on key frames.
+        /* All buffers are implicitly updated on key frames. */
         pc->refresh_golden_frame = 1;
         pc->refresh_alt_ref_frame = 1;
         pc->copy_buffer_to_gf = 0;
         pc->copy_buffer_to_arf = 0;
 
-        // Note that Golden and Altref modes cannot be used on a key frame so
-        // ref_frame_sign_bias[] is undefined and meaningless
+        /* Note that Golden and Altref modes cannot be used on a key frame so
+         * ref_frame_sign_bias[] is undefined and meaningless
+         */
         pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
         pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
     }
@@ -528,7 +534,7 @@ static void init_frame(VP8D_COMP *pbi)
         else
             pc->mcomp_filter_type = BILINEAR;
 
-        // To enable choice of different interploation filters
+        /* To enable choice of different interploation filters */
         if (pc->mcomp_filter_type == SIXTAP)
         {
             xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4);
@@ -548,7 +554,7 @@ static void init_frame(VP8D_COMP *pbi)
     xd->left_context = &pc->left_context;
     xd->mode_info_context = pc->mi;
     xd->frame_type = pc->frame_type;
-    xd->mbmi.mode = DC_PRED;
+    xd->mode_info_context->mbmi.mode = DC_PRED;
     xd->mode_info_stride = pc->mode_info_stride;
 }
 
@@ -559,11 +565,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)
     MACROBLOCKD *const xd  = & pbi->mb;
     const unsigned char *data = (const unsigned char *)pbi->Source;
     const unsigned char *const data_end = data + pbi->source_sz;
-    int first_partition_length_in_bytes;
+    ptrdiff_t first_partition_length_in_bytes;
     int mb_row;
     int i, j, k, l;
     const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
 
+    if (data_end - data < 3)
+        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Truncated packet");
     pc->frame_type = (FRAME_TYPE)(data[0] & 1);
     pc->version = (data[0] >> 1) & 7;
     pc->show_frame = (data[0] >> 4) & 1;
@@ -571,7 +580,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
     data += 3;
 
-    if (data + first_partition_length_in_bytes > data_end)
+    if (data + first_partition_length_in_bytes > data_end
+        || data + first_partition_length_in_bytes < data)
         vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                            "Truncated packet or corrupt partition 0 length");
     vp8_setup_version(pc);
@@ -581,7 +591,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         const int Width = pc->Width;
         const int Height = pc->Height;
 
-        // vet via sync code
+        /* vet via sync code */
         if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
             vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
                                "Invalid frame sync code");
@@ -594,6 +604,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 
         if (Width != pc->Width  ||  Height != pc->Height)
         {
+            int prev_mb_rows = pc->mb_rows;
+
             if (pc->Width <= 0)
             {
                 pc->Width = Width;
@@ -608,9 +620,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)
                                    "Invalid frame height");
             }
 
-            if (vp8_alloc_frame_buffers(&pbi->common, pc->Width, pc->Height))
+            if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height))
                 vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                                    "Failed to allocate frame buffers");
+
+#if CONFIG_MULTITHREAD
+            if (pbi->b_multithreaded_rd)
+                vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+#endif
         }
     }
 
@@ -630,11 +647,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         pc->clamp_type  = (CLAMP_TYPE)vp8_read_bit(bc);
     }
 
-    // Is segmentation enabled
+    /* Is segmentation enabled */
     xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
     if (xd->segmentation_enabled)
     {
-        // Signal whether or not the segmentation map is being explicitly updated this frame.
+        /* Signal whether or not the segmentation map is being explicitly updated this frame. */
         xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
         xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
 
@@ -644,12 +661,12 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 
             vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
 
-            // For each segmentation feature (Quant and loop filter level)
+            /* For each segmentation feature (Quant and loop filter level) */
             for (i = 0; i < MB_LVL_MAX; i++)
             {
                 for (j = 0; j < MAX_MB_SEGMENTS; j++)
                 {
-                    // Frame level data
+                    /* Frame level data */
                     if (vp8_read_bit(bc))
                     {
                         xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
@@ -665,60 +682,60 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 
         if (xd->update_mb_segmentation_map)
         {
-            // Which macro block level features are enabled
+            /* Which macro block level features are enabled */
             vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
 #if CONFIG_SEGMENTATION
-            // Read the probs used to decode the segment id for each macro block.
+            /* Read the probs used to decode the segment id for each macro block. */
             for (i = 0; i < MB_FEATURE_TREE_PROBS+3; i++)
 #else
             for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
 #endif
             {
-                // If not explicitly set value is defaulted to 255 by memset above
+                /* If not explicitly set value is defaulted to 255 by memset above */
                 if (vp8_read_bit(bc))
                     xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
             }
         }
     }
 
-    // Read the loop filter level and type
+    /* Read the loop filter level and type */
     pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
     pc->filter_level = vp8_read_literal(bc, 6);
     pc->sharpness_level = vp8_read_literal(bc, 3);
 
-    // Read in loop filter deltas applied at the MB level based on mode or ref frame.
+    /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
     xd->mode_ref_lf_delta_update = 0;
     xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
 
     if (xd->mode_ref_lf_delta_enabled)
     {
-        // Do the deltas need to be updated
+        /* Do the deltas need to be updated */
         xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
 
         if (xd->mode_ref_lf_delta_update)
         {
-            // Send update
+            /* Send update */
             for (i = 0; i < MAX_REF_LF_DELTAS; i++)
             {
                 if (vp8_read_bit(bc))
                 {
-                    //sign = vp8_read_bit( bc );
+                    /*sign = vp8_read_bit( bc );*/
                     xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
 
-                    if (vp8_read_bit(bc))        // Apply sign
+                    if (vp8_read_bit(bc))        /* Apply sign */
                         xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
                 }
             }
 
-            // Send update
+            /* Send update */
             for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
             {
                 if (vp8_read_bit(bc))
                 {
-                    //sign = vp8_read_bit( bc );
+                    /*sign = vp8_read_bit( bc );*/
                     xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
 
-                    if (vp8_read_bit(bc))        // Apply sign
+                    if (vp8_read_bit(bc))        /* Apply sign */
                         xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
                 }
             }
@@ -728,11 +745,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
     setup_token_decoder(pbi, data + first_partition_length_in_bytes);
     xd->current_bc = &pbi->bc2;
 
-    // Read the default quantizers.
+    /* Read the default quantizers. */
     {
         int Q, q_update;
 
-        Q = vp8_read_literal(bc, 7);  // AC 1st order Q = default
+        Q = vp8_read_literal(bc, 7);  /* AC 1st order Q = default */
         pc->base_qindex = Q;
         q_update = 0;
         pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
@@ -744,20 +761,21 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         if (q_update)
             vp8cx_init_de_quantizer(pbi);
 
-        // MB level dequantizer setup
+        /* MB level dequantizer setup */
         mb_init_dequantizer(pbi, &pbi->mb);
     }
 
-    // Determine if the golden frame or ARF buffer should be updated and how.
-    // For all non key frames the GF and ARF refresh flags and sign bias
-    // flags must be set explicitly.
+    /* Determine if the golden frame or ARF buffer should be updated and how.
+     * For all non key frames the GF and ARF refresh flags and sign bias
+     * flags must be set explicitly.
+     */
     if (pc->frame_type != KEY_FRAME)
     {
-        // Should the GF or ARF be updated from the current frame
+        /* Should the GF or ARF be updated from the current frame */
         pc->refresh_golden_frame = vp8_read_bit(bc);
         pc->refresh_alt_ref_frame = vp8_read_bit(bc);
 
-        // Buffer to buffer copy flags.
+        /* Buffer to buffer copy flags. */
         pc->copy_buffer_to_gf = 0;
 
         if (!pc->refresh_golden_frame)
@@ -793,9 +811,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         fclose(z);
     }
 
-    vp8dx_bool_decoder_fill(bc);
     {
-        // read coef probability tree
+        /* read coef probability tree */
 
         for (i = 0; i < BLOCK_TYPES; i++)
             for (j = 0; j < COEF_BANDS; j++)
@@ -813,57 +830,54 @@ int vp8_decode_frame(VP8D_COMP *pbi)
                     }
     }
 
-    vpx_memcpy(&xd->pre, &pc->last_frame, sizeof(YV12_BUFFER_CONFIG));
-    vpx_memcpy(&xd->dst, &pc->new_frame, sizeof(YV12_BUFFER_CONFIG));
+    vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+    vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
 
 #if CONFIG_SEGMENTATION
      // Create the encoder segmentation map and set all entries to 0
      CHECK_MEM_ERROR(pbi->segmentation_map, vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
 #endif
 
-    // set up frame new frame for intra coded blocks
-    vp8_setup_intra_recon(&pc->new_frame);
+    /* set up frame new frame for intra coded blocks */
+    if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
+        vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
 
     vp8_setup_block_dptrs(xd);
 
     vp8_build_block_doffsets(xd);
 
-    // clear out the coeff buffer
+    /* clear out the coeff buffer */
     vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
 
-    // Read the mb_no_coeff_skip flag
+    /* Read the mb_no_coeff_skip flag */
     pc->mb_no_coeff_skip = (int)vp8_read_bit(bc);
 
-    if (pc->frame_type == KEY_FRAME)
-        vp8_kfread_modes(pbi);
-    else
-        vp8_decode_mode_mvs(pbi);
 
-    // reset since these guys are used as iterators
-    vpx_memset(pc->above_context[Y1CONTEXT], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 4);
-    vpx_memset(pc->above_context[UCONTEXT ], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 2);
-    vpx_memset(pc->above_context[VCONTEXT ], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 2);
-    vpx_memset(pc->above_context[Y2CONTEXT], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols);
-
-    xd->gf_active_ptr = (signed char *)pc->gf_active_flags;     // Point to base of GF active flags data structure
+    vp8_decode_mode_mvs(pbi);
 
+    vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
 
     vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
 
-
-    if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
-        vp8_start_lfthread(pbi);
-
-    if (pbi->b_multithreaded_rd && pbi->common.multi_token_partition != ONE_PARTITION)
+    if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
     {
-        vp8_mtdecode_mb_rows(pbi, xd);
+        vp8mt_decode_mb_rows(pbi, xd);
+        if(pbi->common.filter_level)
+        {
+            /*vp8_mt_loop_filter_frame(pbi);*/ /*cm, &pbi->mb, cm->filter_level);*/
+
+            pc->last_frame_type = pc->frame_type;
+            pc->last_filter_type = pc->filter_type;
+            pc->last_sharpness_level = pc->sharpness_level;
+        }
+        vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]);    /*cm->frame_to_show);*/
     }
     else
     {
         int ibc = 0;
-        int num_part = 1 << pbi->common.multi_token_partition;
+        int num_part = 1 << pc->multi_token_partition;
 
-        // Decode the individual macro block
+        /* Decode the individual macro block */
         for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
         {
 
@@ -878,20 +892,19 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 
             vp8_decode_mb_row(pbi, pc, mb_row, xd);
         }
-
-        pbi->last_mb_row_decoded = mb_row;
     }
 
 
     stop_token_decoder(pbi);
 
-    vp8dx_stop_decode(bc);
-
-    // vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos);
+    /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos); */
 
-    // If this was a kf or Gf note the Q used
-    if ((pc->frame_type == KEY_FRAME) || (pc->refresh_golden_frame) || pbi->common.refresh_alt_ref_frame)
+    /* If this was a kf or Gf note the Q used */
+    if ((pc->frame_type == KEY_FRAME) ||
+         pc->refresh_golden_frame || pc->refresh_alt_ref_frame)
+    {
         pc->last_kf_gf_q = pc->base_qindex;
+    }
 
     if (pc->refresh_entropy_probs == 0)
     {
diff --git a/vp8/decoder/demode.c b/vp8/decoder/demode.c
deleted file mode 100644
index 74fe91803..000000000
--- a/vp8/decoder/demode.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-#include "entropymode.h"
-#include "findnearmv.h"
-
-
-int vp8_read_bmode(vp8_reader *bc, const vp8_prob *p)
-{
-    const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
-
-    return i;
-}
-
-
-int vp8_read_ymode(vp8_reader *bc, const vp8_prob *p)
-{
-    const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
-
-    return i;
-}
-
-int vp8_kfread_ymode(vp8_reader *bc, const vp8_prob *p)
-{
-    const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
-
-    return i;
-}
-
-
-
-int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p)
-{
-    const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
-
-    return i;
-}
-
-void vp8_read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
-{
-    // Is segmentation enabled
-    if (x->segmentation_enabled && x->update_mb_segmentation_map)
-    {
-        // If so then read the segment id.
-        if (vp8_read(r, x->mb_segment_tree_probs[0]))
-            mi->segment_id = (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]));
-        else
-            mi->segment_id = (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1]));
-    }
-}
-
-void vp8_kfread_modes(VP8D_COMP *pbi)
-{
-    VP8_COMMON *const cp = & pbi->common;
-    vp8_reader *const bc = & pbi->bc;
-
-    MODE_INFO *m = cp->mi;
-    const int ms = cp->mode_info_stride;
-#if CONFIG_SEGMENTATION
-    int left_id,above_id;
-    int i;
-#endif
-    int mb_row = -1;
-    vp8_prob prob_skip_false = 0;
-
-    if (cp->mb_no_coeff_skip)
-        prob_skip_false = (vp8_prob)(vp8_read_literal(bc, 8));
-
-    while (++mb_row < cp->mb_rows)
-    {
-        int mb_col = -1;
-
-        while (++mb_col < cp->mb_cols)
-        {
-            MB_PREDICTION_MODE y_mode;
-            vp8dx_bool_decoder_fill(bc);
-
-            // Read the Macroblock segmentation map if it is being updated explicitly this frame (reset to 0 above by default)
-            // By default on a key frame reset all MBs to segment 0
-            m->mbmi.segment_id = 0;
-
-            if (pbi->mb.update_mb_segmentation_map)
-            {
-
-#if CONFIG_SEGMENTATION
-                vp8_read_mb_features(bc, &m->mbmi, &pbi->mb);
-                pbi->segmentation_map[(mb_row * cp->mb_cols) + mb_col] = m->mbmi.segment_id;
-#else
-                vp8_read_mb_features(bc, &m->mbmi, &pbi->mb);
-#endif
-            }
-
-        // Read the macroblock coeff skip flag if this feature is in use, else default to 0
-            if (cp->mb_no_coeff_skip)
-                m->mbmi.mb_skip_coeff = vp8_read(bc, prob_skip_false);
-            else
-                m->mbmi.mb_skip_coeff = 0;
-
-            y_mode = (MB_PREDICTION_MODE) vp8_kfread_ymode(bc, cp->kf_ymode_prob);
-
-            m->mbmi.ref_frame = INTRA_FRAME;
-
-            if ((m->mbmi.mode = y_mode) == B_PRED)
-            {
-                int i = 0;
-
-                do
-                {
-                    const B_PREDICTION_MODE A = vp8_above_bmi(m, i, ms)->mode;
-                    const B_PREDICTION_MODE L = vp8_left_bmi(m, i)->mode;
-
-                    m->bmi[i].mode = (B_PREDICTION_MODE) vp8_read_bmode(bc, cp->kf_bmode_prob [A] [L]);
-                }
-                while (++i < 16);
-            }
-            else
-            {
-                int BMode;
-                int i = 0;
-
-                switch (y_mode)
-                {
-                case DC_PRED:
-                    BMode = B_DC_PRED;
-                    break;
-                case V_PRED:
-                    BMode = B_VE_PRED;
-                    break;
-                case H_PRED:
-                    BMode = B_HE_PRED;
-                    break;
-                case TM_PRED:
-                    BMode = B_TM_PRED;
-                    break;
-                default:
-                    BMode = B_DC_PRED;
-                    break;
-                }
-
-                do
-                {
-                    m->bmi[i].mode = (B_PREDICTION_MODE)BMode;
-                }
-                while (++i < 16);
-            }
-
-            (m++)->mbmi.uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, cp->kf_uv_mode_prob);
-        }
-
-        m++; // skip the border
-    }
-}
diff --git a/vp8/decoder/demode.h b/vp8/decoder/demode.h
deleted file mode 100644
index 51bbc5e7a..000000000
--- a/vp8/decoder/demode.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-
-/* Read (intra) modes for all blocks in a keyframe */
-
-void vp8_kfread_modes(VP8D_COMP *pbi);
-
-/* Intra mode for a Y subblock */
-
-int vp8_read_bmode(vp8_reader *, const vp8_prob *);
-
-/* MB intra Y mode trees differ for key and inter frames. */
-
-int   vp8_read_ymode(vp8_reader *, const vp8_prob *);
-int vp8_kfread_ymode(vp8_reader *, const vp8_prob *);
-
-/* MB intra UV mode trees are the same for key and inter frames. */
-
-int vp8_read_uv_mode(vp8_reader *, const vp8_prob *);
-
-/* Read any macroblock-level features that may be present. */
-
-void vp8_read_mb_features(vp8_reader *, MB_MODE_INFO *, MACROBLOCKD *);
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 14798d9af..84a9fd943 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -23,7 +24,7 @@ void vp8_dequantize_b_c(BLOCKD *d)
     int i;
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
-    short *DQC = &d->dequant[0][0];
+    short *DQC = d->dequant;
 
     for (i = 0; i < 16; i++)
     {
@@ -31,8 +32,12 @@ void vp8_dequantize_b_c(BLOCKD *d)
     }
 }
 
-void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+                            unsigned char *dest, int pitch, int stride)
 {
+    short output[16];
+    short *diff_ptr = output;
+    int r, c;
     int i;
 
     for (i = 0; i < 16; i++)
@@ -40,13 +45,40 @@ void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
         input[i] = dq[i] * input[i];
     }
 
-    vp8_short_idct4x4llm_c(input, output, pitch);
+    /* the idct halves ( >> 1) the pitch */
+    vp8_short_idct4x4llm_c(input, output, 4 << 1);
+
     vpx_memset(input, 0, 32);
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dest[c] = (unsigned char) a;
+        }
+
+        dest += stride;
+        diff_ptr += 4;
+        pred += pitch;
+    }
 }
 
-void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc)
+void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc)
 {
     int i;
+    short output[16];
+    short *diff_ptr = output;
+    int r, c;
 
     input[0] = (short)Dc;
 
@@ -55,6 +87,28 @@ void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, in
         input[i] = dq[i] * input[i];
     }
 
-    vp8_short_idct4x4llm_c(input, output, pitch);
+    /* the idct halves ( >> 1) the pitch */
+    vp8_short_idct4x4llm_c(input, output, 4 << 1);
+
     vpx_memset(input, 0, 32);
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dest[c] = (unsigned char) a;
+        }
+
+        dest += stride;
+        diff_ptr += 4;
+        pred += pitch;
+    }
 }
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index d16b02e58..b78e39c1d 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -15,11 +16,31 @@
 #define prototype_dequant_block(sym) \
     void sym(BLOCKD *x)
 
-#define prototype_dequant_idct(sym) \
-    void sym(short *input, short *dq, short *output, int pitch)
+#define prototype_dequant_idct_add(sym) \
+    void sym(short *input, short *dq, \
+             unsigned char *pred, unsigned char *output, \
+             int pitch, int stride)
 
-#define prototype_dequant_idct_dc(sym) \
-    void sym(short *input, short *dq, short *output, int pitch, int dc)
+#define prototype_dequant_dc_idct_add(sym) \
+    void sym(short *input, short *dq, \
+             unsigned char *pred, unsigned char *output, \
+             int pitch, int stride, \
+             int dc)
+
+#define prototype_dequant_dc_idct_add_y_block(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *pre, unsigned char *dst, \
+             int stride, char *eobs, short *dc)
+
+#define prototype_dequant_idct_add_y_block(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *pre, unsigned char *dst, \
+             int stride, char *eobs)
+
+#define prototype_dequant_idct_add_uv_block(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *pre, unsigned char *dst_u, \
+             unsigned char *dst_v, int stride, char *eobs)
 
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/dequantize_x86.h"
@@ -34,25 +55,52 @@
 #endif
 extern prototype_dequant_block(vp8_dequant_block);
 
-#ifndef vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_c
+#ifndef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+#endif
+extern prototype_dequant_idct_add(vp8_dequant_idct_add);
+
+#ifndef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
+#endif
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
+
+#ifndef vp8_dequant_dc_idct_add_y_block
+#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
 #endif
-extern prototype_dequant_idct(vp8_dequant_idct);
+extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
 
-#ifndef vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_c
+#ifndef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
 #endif
-extern prototype_dequant_idct_dc(vp8_dequant_idct_dc);
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);
+
+#ifndef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+#endif
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
 
 
 typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
-typedef prototype_dequant_idct((*vp8_dequant_idct_fn_t));
-typedef prototype_dequant_idct_dc((*vp8_dequant_idct_dc_fn_t));
+
+typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
+
+typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
+
+typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
+
+typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
+
+typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
+
 typedef struct
 {
-    vp8_dequant_block_fn_t    block;
-    vp8_dequant_idct_fn_t     idct;
-    vp8_dequant_idct_dc_fn_t  idct_dc;
+    vp8_dequant_block_fn_t               block;
+    vp8_dequant_idct_add_fn_t            idct_add;
+    vp8_dequant_dc_idct_add_fn_t         dc_idct_add;
+    vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
+    vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;
+    vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
 } vp8_dequant_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index a42f18dd7..7d013d240 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -13,12 +14,12 @@
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "detokenize.h"
 
-#define BR_COUNT 8
 #define BOOL_DATA UINT8
 
 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-DECLARE_ALIGNED(16, UINT16, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
+DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
@@ -43,49 +44,72 @@ typedef struct
 
 DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) =
 {
-    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //ZERO_TOKEN
-    {  1, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //ONE_TOKEN
-    {  2, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //TWO_TOKEN
-    {  3, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //THREE_TOKEN
-    {  4, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //FOUR_TOKEN
-    {  5, 0, { 159, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //DCT_VAL_CATEGORY1
-    {  7, 1, { 145, 165, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY2
-    { 11, 2, { 140, 148, 173, 0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY3
-    { 19, 3, { 135, 140, 155, 176, 0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY4
-    { 35, 4, { 130, 134, 141, 157, 180, 0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY5
-    { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0   } }, //DCT_VAL_CATEGORY6
-    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  // EOB TOKEN
+    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  /* ZERO_TOKEN */
+    {  1, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   /* ONE_TOKEN */
+    {  2, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   /* TWO_TOKEN */
+    {  3, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   /* THREE_TOKEN */
+    {  4, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   /* FOUR_TOKEN */
+    {  5, 0, { 159, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  /* DCT_VAL_CATEGORY1 */
+    {  7, 1, { 145, 165, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } }, /* DCT_VAL_CATEGORY2 */
+    { 11, 2, { 140, 148, 173, 0,  0,  0,  0,  0,  0,  0,  0,  0   } }, /* DCT_VAL_CATEGORY3 */
+    { 19, 3, { 135, 140, 155, 176, 0,  0,  0,  0,  0,  0,  0,  0   } }, /* DCT_VAL_CATEGORY4 */
+    { 35, 4, { 130, 134, 141, 157, 180, 0,  0,  0,  0,  0,  0,  0   } }, /* DCT_VAL_CATEGORY5 */
+    { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0   } }, /* DCT_VAL_CATEGORY6 */
+    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  /*  EOB TOKEN */
 };
 
 
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
 {
-    ENTROPY_CONTEXT **const A = x->above_context;
-    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
-
-    ENTROPY_CONTEXT *a;
-    ENTROPY_CONTEXT *l;
-    int i;
-
-    for (i = 0; i < 24; i++)
+    /* Clear entropy contexts for Y2 blocks */
+    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
     {
-
-        a = A[ vp8_block2context[i] ] + vp8_block2above[i];
-        l = L[ vp8_block2context[i] ] + vp8_block2left[i];
-
-        *a = *l = 0;
+        vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+        vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
     }
-
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    else
     {
-        a = A[Y2CONTEXT] + vp8_block2above[24];
-        l = L[Y2CONTEXT] + vp8_block2left[24];
-        *a = *l = 0;
+        vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+        vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
     }
+}
 
+#if CONFIG_ARM_ASM_DETOK
+/* mashup of vp8_block2left and vp8_block2above so we only need one pointer
+ * for the assembly version.
+ */
+DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
+{
+    /* vp8_block2left */
+    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* vp8_block2above */
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
 
+void vp8_init_detokenizer(VP8D_COMP *dx)
+{
+    const VP8_COMMON *const oc = & dx->common;
+    MACROBLOCKD *x = & dx->mb;
+
+    dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
+    dx->detoken.ptr_block2leftabove = vp8_block2leftabove;
+    dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x;
+    dx->detoken.scan = vp8_default_zig_zag1d;
+    dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
+    dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
+
+    dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]);
+    dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]);
+    dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]);
+    dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]);
 }
-DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
+#endif
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
+#define FILL \
+    if(count < 0) \
+        VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
 #define NORMALIZE \
     /*if(range < 0x80)*/                            \
     { \
@@ -93,17 +117,13 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
         range <<= shift; \
         value <<= shift; \
         count -= shift; \
-        if(count <= 0) \
-        { \
-            count += BR_COUNT ; \
-            value |= (*bufptr) << (BR_COUNT-count); \
-            bufptr = br_ptr_advance(bufptr, 1); \
-        } \
     }
 
 #define DECODE_AND_APPLYSIGN(value_to_sign) \
     split = (range + 1) >> 1; \
-    if ( (value >> 8) < split ) \
+    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
+    FILL \
+    if ( value < bigsplit ) \
     { \
         range = split; \
         v= value_to_sign; \
@@ -111,28 +131,25 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
     else \
     { \
         range = range-split; \
-        value = value-(split<<8); \
+        value = value-bigsplit; \
         v = -value_to_sign; \
     } \
     range +=range;                   \
     value +=value;                   \
-    if (!--count) \
-    { \
-        count = BR_COUNT; \
-        value |= *bufptr; \
-        bufptr = br_ptr_advance(bufptr, 1); \
-    }
+    count--;
 
 #define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
     { \
         split = 1 +  ((( probability*(range-1) ) )>> 8); \
-        if ( (value >> 8) < split ) \
+        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
+        FILL \
+        if ( value < bigsplit ) \
         { \
             range = split; \
             NORMALIZE \
             goto branch; \
         } \
-        value -= (split<<8); \
+        value -= bigsplit; \
         range = range - split; \
         NORMALIZE \
     }
@@ -140,7 +157,9 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
 #define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
     { \
         split = 1 + ((( probability*(range-1) ) ) >> 8); \
-        if ( (value >> 8) < split ) \
+        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
+        FILL \
+        if ( value < bigsplit ) \
         { \
             range = split; \
             NORMALIZE \
@@ -151,7 +170,7 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
             goto branch; \
             } goto BLOCK_FINISHED; /*for malformed input */\
         } \
-        value -= (split<<8); \
+        value -= bigsplit; \
         range = range - split; \
         NORMALIZE \
     }
@@ -169,10 +188,12 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
 
 #define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\
     split = 1 +  (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \
-    if(value >= (split<<8))\
+    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
+    FILL \
+    if(value >= bigsplit)\
     {\
         range = range-split;\
-        value = value-(split<<8);\
+        value = value-bigsplit;\
         val += ((UINT16)1<<bits_count);\
     }\
     else\
@@ -181,14 +202,45 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
     }\
     NORMALIZE
 
+#if CONFIG_ARM_ASM_DETOK
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
-    ENTROPY_CONTEXT **const A = x->above_context;
-    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    int eobtotal = 0;
+    int i, type;
+
+    dx->detoken.current_bc = x->current_bc;
+    dx->detoken.A = x->above_context;
+    dx->detoken.L = x->left_context;
+
+    type = 3;
+
+    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        type = 1;
+        eobtotal -= 16;
+    }
+
+    vp8_decode_mb_tokens_v6(&dx->detoken, type);
+
+    for (i = 0; i < 25; i++)
+    {
+        x->eobs[i] = dx->detoken.eob[i];
+        eobtotal += dx->detoken.eob[i];
+    }
+
+    return eobtotal;
+}
+#else
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
+    ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
     const VP8_COMMON *const oc = & dx->common;
 
     BOOL_DECODER *bc = x->current_bc;
 
+    char *eobs = x->eobs;
+
     ENTROPY_CONTEXT *a;
     ENTROPY_CONTEXT *l;
     int i;
@@ -198,11 +250,13 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
     register int count;
 
     const BOOL_DATA *bufptr;
+    const BOOL_DATA *bufend;
     register unsigned int range;
-    register unsigned int value;
+    VP8_BD_VALUE value;
     const int *scan;
     register unsigned int shift;
     UINT32 split;
+    VP8_BD_VALUE bigsplit;
     INT16 *qcoeff_ptr;
 
     const vp8_prob *coef_probs;
@@ -210,46 +264,44 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
     int stop;
     INT16 val, bits_count;
     INT16 c;
-    INT16 t;
     INT16 v;
     const vp8_prob *Prob;
 
-    //int *scan;
     type = 3;
     i = 0;
     stop = 16;
 
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    scan = vp8_default_zig_zag1d;
+    qcoeff_ptr = &x->qcoeff[0];
+
+    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
     {
         i = 24;
         stop = 24;
         type = 1;
-        qcoeff_ptr = &x->qcoeff[24*16];
-        scan = vp8_default_zig_zag1d;
+        qcoeff_ptr += 24*16;
         eobtotal -= 16;
     }
-    else
-    {
-        scan = vp8_default_zig_zag1d;
-        qcoeff_ptr = &x->qcoeff[0];
-    }
 
+    bufend  = bc->user_buffer_end;
+    bufptr  = bc->user_buffer;
+    value   = bc->value;
     count   = bc->count;
     range   = bc->range;
-    value   = bc->value;
-    bufptr  = bc->read_ptr;
 
 
     coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
 
 BLOCK_LOOP:
-    a = A[ vp8_block2context[i] ] + vp8_block2above[i];
-    l = L[ vp8_block2context[i] ] + vp8_block2left[i];
+    a = A + vp8_block2above[i];
+    l = L + vp8_block2left[i];
+
     c = (INT16)(!type);
 
-    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+    /*Dest = ((A)!=0) + ((B)!=0);*/
+    VP8_COMBINEENTROPYCONTEXTS(v, *a, *l);
     Prob = coef_probs;
-    Prob += t * ENTROPY_NODES;
+    Prob += v * ENTROPY_NODES;
 
 DO_WHILE:
     Prob += vp8_coef_bands_x[c];
@@ -336,9 +388,8 @@ ONE_CONTEXT_NODE_0_:
 
     qcoeff_ptr [ scan[15] ] = (INT16) v;
 BLOCK_FINISHED:
-    t = ((x->block[i].eob = c) != !type);   // any nonzero data?
-    eobtotal += x->block[i].eob;
-    *a = *l = t;
+    *a = *l = ((eobs[i] = c) != !type);   /* any nonzero data? */
+    eobtotal += c;
     qcoeff_ptr += 16;
 
     i++;
@@ -348,12 +399,11 @@ BLOCK_FINISHED:
 
     if (i == 25)
     {
-        scan = vp8_default_zig_zag1d;//x->scan_order1d;
         type = 0;
         i = 0;
         stop = 16;
         coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
-        qcoeff_ptr = &x->qcoeff[0];
+        qcoeff_ptr -= (24*16 + 16);
         goto BLOCK_LOOP;
     }
 
@@ -365,10 +415,12 @@ BLOCK_FINISHED:
         goto BLOCK_LOOP;
     }
 
-    bc->count = count;
+    FILL
+    bc->user_buffer = bufptr;
     bc->value = value;
+    bc->count = count;
     bc->range = range;
-    bc->read_ptr = bufptr;
     return eobtotal;
 
 }
+#endif /*!CONFIG_ASM_DETOK*/
diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h
index 6a9a47607..294a4a55d 100644
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -1,19 +1,24 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
-#ifndef detokenize_h
-#define detokenize_h 1
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
 
 #include "onyxd_int.h"
 
+#if ARCH_ARM
+#include "arm/detokenize_arm.h"
+#endif
+
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
 
-#endif /* detokenize_h */
+#endif /* DETOKENIZE_H */
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 302b64bf8..2e284729b 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -13,19 +14,22 @@
 #include "onyxd_int.h"
 
 extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
+extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
 
 void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 {
-    // Pure C:
+    /* Pure C: */
 #if CONFIG_RUNTIME_CPU_DETECT
-    pbi->mb.rtcd         = &pbi->common.rtcd;
-    pbi->dequant.block   = vp8_dequantize_b_c;
-    pbi->dequant.idct    = vp8_dequant_idct_c;
-    pbi->dequant.idct_dc = vp8_dequant_dc_idct_c;
-    pbi->dboolhuff.start = vp8dx_start_decode_c;
-    pbi->dboolhuff.stop  = vp8dx_stop_decode_c;
-    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
-#if 0 //For use with RTCD, when implemented
+    pbi->mb.rtcd                     = &pbi->common.rtcd;
+    pbi->dequant.block               = vp8_dequantize_b_c;
+    pbi->dequant.idct_add            = vp8_dequant_idct_add_c;
+    pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_c;
+    pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
+    pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
+    pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
+    pbi->dboolhuff.start             = vp8dx_start_decode_c;
+    pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
+#if 0 /*For use with RTCD, when implemented*/
     pbi->dboolhuff.debool = vp8dx_decode_bool_c;
     pbi->dboolhuff.devalue = vp8dx_decode_value_c;
 #endif
@@ -34,4 +38,8 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_decode_init(pbi);
 #endif
+
+#if ARCH_ARM
+    vp8_arch_arm_decode_init(pbi);
+#endif
 }
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
new file mode 100644
index 000000000..c98bd5bb8
--- /dev/null
+++ b/vp8/decoder/idct_blk.c
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "dequantize.h"
+
+void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+                            unsigned char *dest, int pitch, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            unsigned char *dst_ptr, int pitch, int stride);
+
+void vp8_dequant_dc_idct_add_y_block_c
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, short *dc)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_dc_idct_add_c (q, dq, pre, dst, 16, stride, dc[0]);
+            else
+                vp8_dc_only_idct_add_c (dc[0], pre, dst, 16, stride);
+
+            q   += 16;
+            pre += 4;
+            dst += 4;
+            dc  ++;
+        }
+
+        pre += 64 - 16;
+        dst += 4*stride - 16;
+    }
+}
+
+void vp8_dequant_idct_add_y_block_c
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q   += 16;
+            pre += 4;
+            dst += 4;
+        }
+
+        pre += 64 - 16;
+        dst += 4*stride - 16;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_c
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, pre, dstu, 8, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstu, 8, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q    += 16;
+            pre  += 4;
+            dstu += 4;
+        }
+
+        pre  += 32 - 8;
+        dstu += 4*stride - 8;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, pre, dstv, 8, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstv, 8, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q    += 16;
+            pre  += 4;
+            dstv += 4;
+        }
+
+        pre  += 32 - 8;
+        dstv += 4*stride - 8;
+    }
+}
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 8d2b267a9..063b6a468 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -23,18 +24,19 @@
 #include "threading.h"
 #include "decoderthreading.h"
 #include <stdio.h>
-#include "segmentation_common.h"
+
 #include "quant_common.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
-
+#include "detokenize.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
 
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
-
 extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 
-// DEBUG code
 #if CONFIG_DEBUG
 void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
 {
@@ -110,12 +112,13 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     pbi->common.current_video_frame = 0;
     pbi->ready_for_new_data = 1;
 
-    pbi->CPUFreq = 0; //vp8_get_processor_freq();
+    pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
     pbi->max_threads = oxcf->max_threads;
     vp8_decoder_create_threads(pbi);
 
-    //vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
-    // unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+    /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
+     *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+     */
     vp8cx_init_de_quantizer(pbi);
 
     {
@@ -127,6 +130,9 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
         cm->last_sharpness_level = cm->sharpness_level;
     }
 
+#if CONFIG_ARM_ASM_DETOK
+    vp8_init_detokenizer(pbi);
+#endif
     pbi->common.error.setjmp = 0;
     return (VP8D_PTR) pbi;
 }
@@ -142,6 +148,11 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
     if (pbi->segmentation_map != 0)
         vpx_free(pbi->segmentation_map);
 #endif
+
+#if CONFIG_MULTITHREAD
+    if (pbi->b_multithreaded_rd)
+        vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+#endif
     vp8_decoder_remove_threads(pbi);
     vp8_remove_common(&pbi->common);
     vpx_free(pbi);
@@ -181,57 +192,143 @@ int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_C
 {
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
     VP8_COMMON *cm = &pbi->common;
+    int ref_fb_idx;
 
     if (ref_frame_flag == VP8_LAST_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->last_frame, sd);
-
+        ref_fb_idx = cm->lst_fb_idx;
     else if (ref_frame_flag == VP8_GOLD_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->golden_frame, sd);
-
+        ref_fb_idx = cm->gld_fb_idx;
     else if (ref_frame_flag == VP8_ALT_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, sd);
-
+        ref_fb_idx = cm->alt_fb_idx;
     else
         return -1;
 
+    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
     return 0;
 }
 int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
     VP8_COMMON *cm = &pbi->common;
+    int ref_fb_idx;
 
     if (ref_frame_flag == VP8_LAST_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->last_frame);
-
+        ref_fb_idx = cm->lst_fb_idx;
     else if (ref_frame_flag == VP8_GOLD_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->golden_frame);
-
+        ref_fb_idx = cm->gld_fb_idx;
     else if (ref_frame_flag == VP8_ALT_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->alt_ref_frame);
-
+        ref_fb_idx = cm->alt_fb_idx;
     else
         return -1;
 
+    vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
+
     return 0;
 }
 
-//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
 #if HAVE_ARMV7
 extern void vp8_push_neon(INT64 *store);
 extern void vp8_pop_neon(INT64 *store);
-static INT64 dx_store_reg[8];
 #endif
+
+static int get_free_fb (VP8_COMMON *cm)
+{
+    int i;
+    for (i = 0; i < NUM_YV12_BUFFERS; i++)
+        if (cm->fb_idx_ref_cnt[i] == 0)
+            break;
+
+    cm->fb_idx_ref_cnt[i] = 1;
+    return i;
+}
+
+static void ref_cnt_fb (int *buf, int *idx, int new_idx)
+{
+    if (buf[*idx] > 0)
+        buf[*idx]--;
+
+    *idx = new_idx;
+
+    buf[new_idx]++;
+}
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers (VP8_COMMON *cm)
+{
+    int fb_to_update_with, err = 0;
+
+    if (cm->refresh_last_frame)
+        fb_to_update_with = cm->lst_fb_idx;
+    else
+        fb_to_update_with = cm->new_fb_idx;
+
+    /* The alternate reference frame or golden frame can be updated
+     *  using the new, last, or golden/alt ref frame.  If it
+     *  is updated using the newly decoded frame it is a refresh.
+     *  An update using the last or golden/alt ref frame is a copy.
+     */
+    if (cm->copy_buffer_to_arf)
+    {
+        int new_fb = 0;
+
+        if (cm->copy_buffer_to_arf == 1)
+            new_fb = fb_to_update_with;
+        else if (cm->copy_buffer_to_arf == 2)
+            new_fb = cm->gld_fb_idx;
+        else
+            err = -1;
+
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+    }
+
+    if (cm->copy_buffer_to_gf)
+    {
+        int new_fb = 0;
+
+        if (cm->copy_buffer_to_gf == 1)
+            new_fb = fb_to_update_with;
+        else if (cm->copy_buffer_to_gf == 2)
+            new_fb = cm->alt_fb_idx;
+        else
+            err = -1;
+
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+    }
+
+    if (cm->refresh_golden_frame)
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+    if (cm->refresh_alt_ref_frame)
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+    if (cm->refresh_last_frame)
+    {
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+        cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+    }
+    else
+        cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+    cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+    return err;
+}
+
 int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, INT64 time_stamp)
 {
+#if HAVE_ARMV7
+    INT64 dx_store_reg[8];
+#endif
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
     VP8_COMMON *cm = &pbi->common;
     int retcode = 0;
-
     struct vpx_usec_timer timer;
 
-//  if(pbi->ready_for_new_data == 0)
-//      return -1;
+    /*if(pbi->ready_for_new_data == 0)
+        return -1;*/
 
     if (ptr == 0)
     {
@@ -240,21 +337,38 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 
     pbi->common.error.error_code = VPX_CODEC_OK;
 
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+        vp8_push_neon(dx_store_reg);
+    }
+#endif
+
+    cm->new_fb_idx = get_free_fb (cm);
+
     if (setjmp(pbi->common.error.jmp))
     {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
+#endif
+        {
+            vp8_pop_neon(dx_store_reg);
+        }
+#endif
         pbi->common.error.setjmp = 0;
+        if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+          cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
         return -1;
     }
 
     pbi->common.error.setjmp = 1;
 
-#if HAVE_ARMV7
-    vp8_push_neon(dx_store_reg);
-#endif
-
     vpx_usec_timer_start(&timer);
 
-    //cm->current_video_frame++;
+    /*cm->current_video_frame++;*/
     pbi->Source = source;
     pbi->source_sz = size;
 
@@ -263,103 +377,80 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
     if (retcode < 0)
     {
 #if HAVE_ARMV7
-        vp8_pop_neon(dx_store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
+#endif
+        {
+            vp8_pop_neon(dx_store_reg);
+        }
 #endif
         pbi->common.error.error_code = VPX_CODEC_ERROR;
         pbi->common.error.setjmp = 0;
+        if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+          cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
         return retcode;
     }
 
-    // Update the GF useage maps.
-    vp8_update_gf_useage_maps(cm, &pbi->mb);
-
-    if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
-        vp8_stop_lfthread(pbi);
-
-    if (cm->refresh_last_frame)
+    if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
     {
-        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
-
-        cm->frame_to_show = &cm->last_frame;
-    }
-    else
-    {
-        cm->frame_to_show = &cm->new_frame;
-    }
-
-    if (!pbi->b_multithreaded_lf)
+        if (swap_frame_buffers (cm))
+        {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+            if (cm->rtcd.flags & HAS_NEON)
+#endif
+            {
+                vp8_pop_neon(dx_store_reg);
+            }
+#endif
+            pbi->common.error.error_code = VPX_CODEC_ERROR;
+            pbi->common.error.setjmp = 0;
+            return -1;
+        }
+    } else
     {
-        struct vpx_usec_timer lpftimer;
-        vpx_usec_timer_start(&lpftimer);
-        // Apply the loop filter if appropriate.
+        if (swap_frame_buffers (cm))
+        {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+            if (cm->rtcd.flags & HAS_NEON)
+#endif
+            {
+                vp8_pop_neon(dx_store_reg);
+            }
+#endif
+            pbi->common.error.error_code = VPX_CODEC_ERROR;
+            pbi->common.error.setjmp = 0;
+            return -1;
+        }
 
-        if (cm->filter_level > 0)
+        if(pbi->common.filter_level)
         {
+            struct vpx_usec_timer lpftimer;
+            vpx_usec_timer_start(&lpftimer);
+            /* Apply the loop filter if appropriate. */
+
             vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
+
+            vpx_usec_timer_mark(&lpftimer);
+            pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
+
             cm->last_frame_type = cm->frame_type;
             cm->last_filter_type = cm->filter_type;
             cm->last_sharpness_level = cm->sharpness_level;
-
         }
-
-        vpx_usec_timer_mark(&lpftimer);
-        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
+        vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
     }
 
-    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
-
-    // DEBUG code
 #if 0
+    /* DEBUG code */
+    /*vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);*/
     vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
 
     if (cm->current_video_frame <= 5)
         write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
 #endif
 
-    // If any buffer copy / swaping is signalled it should be done here.
-    if (cm->copy_buffer_to_arf)
-    {
-        if (cm->copy_buffer_to_arf == 1)
-        {
-            if (cm->refresh_last_frame)
-                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
-            else
-                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
-        }
-        else if (cm->copy_buffer_to_arf == 2)
-            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
-    }
-
-    if (cm->copy_buffer_to_gf)
-    {
-        if (cm->copy_buffer_to_gf == 1)
-        {
-            if (cm->refresh_last_frame)
-                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
-            else
-                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
-        }
-        else if (cm->copy_buffer_to_gf == 2)
-            vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
-    }
-
-    // Should the golden or alternate reference frame be refreshed?
-    if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
-    {
-        if (cm->refresh_golden_frame)
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
-
-        if (cm->refresh_alt_ref_frame)
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
-
-        //vpx_log("Decoder: recovery frame received \n");
-
-        // Update data structures that monitors GF useage
-        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
-    }
-
     vp8_clear_system_state();
 
     vpx_usec_timer_mark(&timer);
@@ -367,7 +458,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 
     pbi->time_decoding += pbi->decode_microseconds;
 
-//  vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);
+    /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/
 
     if (cm->show_frame)
         cm->current_video_frame++;
@@ -410,12 +501,17 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 #endif
 
 #if HAVE_ARMV7
-    vp8_pop_neon(dx_store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+        vp8_pop_neon(dx_store_reg);
+    }
 #endif
     pbi->common.error.setjmp = 0;
     return retcode;
 }
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
 {
     int ret = -1;
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -423,7 +519,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
     if (pbi->ready_for_new_data == 1)
         return ret;
 
-    // ie no raw frame to show!!!
+    /* ie no raw frame to show!!! */
     if (pbi->common.show_frame == 0)
         return ret;
 
@@ -433,7 +529,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
 
     sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
-    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else
 
     if (pbi->common.frame_to_show)
@@ -449,7 +545,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
         ret = -1;
     }
 
-#endif //!CONFIG_POSTPROC
+#endif /*!CONFIG_POSTPROC*/
     vp8_clear_system_state();
     return ret;
 }
diff --git a/vp8/decoder/onyxd_if_sjl.c b/vp8/decoder/onyxd_if_sjl.c
deleted file mode 100644
index 363ad5d72..000000000
--- a/vp8/decoder/onyxd_if_sjl.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "postproc.h"
-#include "onyxd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "alloccommon.h"
-#include "vpx_scale/yv12extend.h"
-#include "loopfilter.h"
-#include "swapyv12buffer.h"
-#include "g_common.h"
-#include "threading.h"
-#include "decoderthreading.h"
-#include <stdio.h>
-#include "segmentation_common.h"
-#include "quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "systemdependent.h"
-#include "vpx_ports/vpx_timer.h"
-
-
-#ifndef VPX_NO_GLOBALS
-static int init_ct = 0;
-#else
-# include "vpx_global_handling.h"
-# define init_ct ((int)vpxglobalm(onyxd,init_ct))
-#endif
-
-extern void vp8_init_loop_filter(VP8_COMMON *cm);
-
-extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
-extern void init_detokenizer(VP8D_COMP *dx);
-
-// DEBUG code
-void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
-{
-    FILE *yuv_file = fopen((char *)name, "ab");
-    unsigned char *src = s->y_buffer;
-    int h = s->y_height;
-
-    do
-    {
-        fwrite(src, s->y_width, 1,  yuv_file);
-        src += s->y_stride;
-    }
-    while (--h);
-
-    src = s->u_buffer;
-    h = s->uv_height;
-
-    do
-    {
-        fwrite(src, s->uv_width, 1,  yuv_file);
-        src += s->uv_stride;
-    }
-    while (--h);
-
-    src = s->v_buffer;
-    h = s->uv_height;
-
-    do
-    {
-        fwrite(src, s->uv_width, 1, yuv_file);
-        src += s->uv_stride;
-    }
-    while (--h);
-
-    fclose(yuv_file);
-}
-
-void vp8dx_initialize()
-{
-    if (!init_ct++)
-    {
-        vp8_initialize_common();
-        vp8_scale_machine_specific_config();
-    }
-}
-
-void vp8dx_shutdown()
-{
-    if (!--init_ct)
-    {
-        vp8_shutdown_common();
-    }
-}
-
-
-VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
-{
-    VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
-
-    if (!pbi)
-        return NULL;
-
-    vpx_memset(pbi, 0, sizeof(VP8D_COMP));
-
-    vp8dx_initialize();
-
-    vp8_create_common(&pbi->common);
-    vp8_dmachine_specific_config(pbi);
-
-    pbi->common.current_video_frame = 0;
-    pbi->ready_for_new_data = 1;
-
-    pbi->CPUFreq = 0; //vp8_get_processor_freq();
-    pbi->max_threads = oxcf->max_threads;
-    vp8_decoder_create_threads(pbi);
-
-    //vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
-    // unnecessary calling of vp8cx_init_de_quantizer() for every frame.
-    vp8cx_init_de_quantizer(pbi);
-
-    {
-        VP8_COMMON *cm = &pbi->common;
-
-        vp8_init_loop_filter(cm);
-        cm->last_frame_type = KEY_FRAME;
-        cm->last_filter_type = cm->filter_type;
-        cm->last_sharpness_level = cm->sharpness_level;
-    }
-
-    init_detokenizer(pbi);
-
-    return (VP8D_PTR) pbi;
-}
-void vp8dx_remove_decompressor(VP8D_PTR ptr)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-
-    if (!pbi)
-        return;
-
-    vp8_decoder_remove_threads(pbi);
-    vp8_remove_common(&pbi->common);
-    vpx_free(pbi);
-    vp8dx_shutdown();
-
-}
-
-void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) comp;
-
-    (void) pbi;
-    (void) x;
-
-    switch (oxst)
-    {
-    case VP8D_OK:
-        break;
-    }
-}
-
-int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) comp;
-
-    (void) pbi;
-
-    switch (oxst)
-    {
-    case VP8D_OK:
-        break;
-    }
-
-    return -1;
-}
-
-int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-    VP8_COMMON *cm = &pbi->common;
-
-    if (ref_frame_flag == VP8_LAST_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->last_frame, sd);
-
-    else if (ref_frame_flag == VP8_GOLD_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->golden_frame, sd);
-
-    else if (ref_frame_flag == VP8_ALT_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, sd);
-
-    else
-        return -1;
-
-    return 0;
-}
-int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-    VP8_COMMON *cm = &pbi->common;
-
-    if (ref_frame_flag == VP8_LAST_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->last_frame);
-
-    else if (ref_frame_flag == VP8_GOLD_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->golden_frame);
-
-    else if (ref_frame_flag == VP8_ALT_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->alt_ref_frame);
-
-    else
-        return -1;
-
-    return 0;
-}
-int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, char *source, INT64 time_stamp)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-    VP8_COMMON *cm = &pbi->common;
-    int retcode = 0;
-
-    struct vpx_usec_timer timer;
-    (void) size;
-
-//  if(pbi->ready_for_new_data == 0)
-//      return -1;
-
-    vpx_usec_timer_start(&timer);
-
-    if (ptr == 0)
-    {
-        return -1;
-    }
-
-    //cm->current_video_frame++;
-    pbi->Source = source;
-
-    retcode = vp8_decode_frame(pbi);
-
-    if (retcode < 0)
-        return retcode;
-
-    // Update the GF useage maps.
-    vp8_update_gf_useage_maps(cm, &pbi->mb);
-
-    if (pbi->b_multithreaded)
-        vp8_stop_lfthread(pbi);
-
-    if (cm->refresh_last_frame)
-    {
-        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
-
-        cm->frame_to_show = &cm->last_frame;
-    }
-    else
-    {
-        cm->frame_to_show = &cm->new_frame;
-    }
-
-    if (!pbi->b_multithreaded)
-    {
-        struct vpx_usec_timer lpftimer;
-        vpx_usec_timer_start(&lpftimer);
-        // Apply the loop filter if appropriate.
-
-        if (cm->filter_level > 0)
-        {
-            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
-            cm->last_frame_type = cm->frame_type;
-            cm->last_filter_type = cm->filter_type;
-            cm->last_sharpness_level = cm->sharpness_level;
-
-        }
-
-        vpx_usec_timer_mark(&lpftimer);
-        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
-    }
-
-    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
-#if 0
-    // DEBUG code
-    //vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
-    if (cm->current_video_frame <= 5)
-        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
-#endif
-
-    // If any buffer copy / swaping is signalled it should be done here.
-    if (cm->copy_buffer_to_arf)
-    {
-        if (cm->copy_buffer_to_arf == 1)
-        {
-            if (cm->refresh_last_frame)
-                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
-            else
-                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
-        }
-        else if (cm->copy_buffer_to_arf == 2)
-            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
-    }
-
-    if (cm->copy_buffer_to_gf)
-    {
-        if (cm->copy_buffer_to_gf == 1)
-        {
-            if (cm->refresh_last_frame)
-                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
-            else
-                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
-        }
-        else if (cm->copy_buffer_to_gf == 2)
-            vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
-    }
-
-    // Should the golden or alternate reference frame be refreshed?
-    if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
-    {
-        if (cm->refresh_golden_frame)
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
-
-        if (cm->refresh_alt_ref_frame)
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
-
-        //vpx_log("Decoder: recovery frame received \n");
-
-        // Update data structures that monitors GF useage
-        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
-    }
-
-    vp8_clear_system_state();
-
-    vpx_usec_timer_mark(&timer);
-    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
-
-    pbi->time_decoding += pbi->decode_microseconds;
-
-//  vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);
-
-    cm->current_video_frame++;
-    pbi->ready_for_new_data = 0;
-    pbi->last_time_stamp = time_stamp;
-
-    {
-        int i;
-        INT64 earliest_time = pbi->dr[0].time_stamp;
-        INT64 latest_time = pbi->dr[0].time_stamp;
-        INT64 time_diff = 0;
-        int bytes = 0;
-
-        pbi->dr[pbi->common.current_video_frame&0xf].size = pbi->bc.pos + pbi->bc2.pos + 4;;
-        pbi->dr[pbi->common.current_video_frame&0xf].time_stamp = time_stamp;
-
-        for (i = 0; i < 16; i++)
-        {
-
-            bytes += pbi->dr[i].size;
-
-            if (pbi->dr[i].time_stamp < earliest_time)
-                earliest_time = pbi->dr[i].time_stamp;
-
-            if (pbi->dr[i].time_stamp > latest_time)
-                latest_time = pbi->dr[i].time_stamp;
-        }
-
-        time_diff = latest_time - earliest_time;
-
-        if (time_diff > 0)
-        {
-            pbi->common.bitrate = 80000.00 * bytes / time_diff  ;
-            pbi->common.framerate = 160000000.00 / time_diff ;
-        }
-
-    }
-    return retcode;
-}
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
-{
-    int ret = -1;
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-
-    if (pbi->ready_for_new_data == 1)
-        return ret;
-
-    // ie no raw frame to show!!!
-    if (pbi->common.show_frame == 0)
-        return ret;
-
-    pbi->ready_for_new_data = 1;
-    *time_stamp = pbi->last_time_stamp;
-    *time_end_stamp = 0;
-
-    sd->clrtype = pbi->common.clr_type;
-    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
-    vp8_clear_system_state();
-    return ret;
-}
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index e8b5d409a..fc1811d7f 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -47,21 +48,20 @@ typedef struct
 
 typedef struct
 {
-    int *scan;
-    UINT8 *ptr_onyxblock2context_leftabove;
-    vp8_tree_index *vp8_coef_tree_ptr;  //onyx_coef_tree_ptr; ???
-    TOKENEXTRABITS *teb_base_ptr;
+    int const *scan;
+    UINT8 const *ptr_block2leftabove;
+    vp8_tree_index const *vp8_coef_tree_ptr;
+    TOKENEXTRABITS const *teb_base_ptr;
     unsigned char *norm_ptr;
-//  UINT16 *ptr_onyx_coef_bands_x;
-    UINT8 *ptr_onyx_coef_bands_x;
+    UINT8 *ptr_coef_bands_x;
 
-    ENTROPY_CONTEXT   **A;
-    ENTROPY_CONTEXT(*L)[4];
+    ENTROPY_CONTEXT_PLANES *A;
+    ENTROPY_CONTEXT_PLANES *L;
 
     INT16 *qcoeff_start_ptr;
     BOOL_DECODER *current_bc;
 
-    UINT8 *coef_probs[4];
+    vp8_prob const *coef_probs[4];
 
     UINT8 eob[25];
 
@@ -88,27 +88,32 @@ typedef struct VP8Decompressor
     unsigned int time_loop_filtering;
 
     volatile int b_multithreaded_rd;
-    volatile int b_multithreaded_lf;
     int max_threads;
-    int last_mb_row_decoded;
     int current_mb_col_main;
     int decoding_thread_count;
     int allocated_decoding_thread_count;
-    // variable for threading
-    DECLARE_ALIGNED(16, MACROBLOCKD, lpfmb);
+    /* variable for threading */
 #if CONFIG_MULTITHREAD
-    pthread_t           h_thread_lpf;         // thread for postprocessing
-    sem_t               h_event_lpf;          // Event for post_proc completed
-    sem_t               h_event_start_lpf;
-#endif
+    int mt_baseline_filter_level[MAX_MB_SEGMENTS];
+    int sync_range;
+    int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
+
+    unsigned char **mt_yabove_row;           /* mb_rows x width */
+    unsigned char **mt_uabove_row;
+    unsigned char **mt_vabove_row;
+    unsigned char **mt_yleft_col;            /* mb_rows x 16 */
+    unsigned char **mt_uleft_col;            /* mb_rows x 8 */
+    unsigned char **mt_vleft_col;            /* mb_rows x 8 */
+
     MB_ROW_DEC           *mb_row_di;
-    DECODETHREAD_DATA   *de_thread_data;
-#if CONFIG_MULTITHREAD
+    DECODETHREAD_DATA    *de_thread_data;
+
     pthread_t           *h_decoding_thread;
-    sem_t               *h_event_mbrdecoding;
-    sem_t               h_event_main;
-    // end of threading data
+    sem_t               *h_event_start_decoding;
+    sem_t                h_event_end_decoding;
+    /* end of threading data */
 #endif
+
     vp8_reader *mbc;
     INT64 last_time_stamp;
     int   ready_for_new_data;
@@ -122,6 +127,12 @@ typedef struct VP8Decompressor
     struct vp8_dboolhuff_rtcd_vtable dboolhuff;
 #endif
 
+
+    vp8_prob prob_intra;
+    vp8_prob prob_last;
+    vp8_prob prob_gf;
+    vp8_prob prob_skip_false;
+
 } VP8D_COMP;
 
 int vp8_decode_frame(VP8D_COMP *cpi);
diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c
new file mode 100644
index 000000000..ad4324b27
--- /dev/null
+++ b/vp8/decoder/reconintra_mt.c
@@ -0,0 +1,982 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxd_int.h"
+
+/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
+ * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
+ */
+
+void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+    unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
+    unsigned char *yleft_col;
+    unsigned char yleft_buf[16];
+    unsigned char ytop_left;      /* = yabove_row[-1]; */
+    unsigned char *ypred_ptr = x->predictor;
+    int r, c, i;
+
+    if (pbi->common.filter_level)
+    {
+        yabove_row = pbi->mt_yabove_row[mb_row] + mb_col*16 +32;
+        yleft_col = pbi->mt_yleft_col[mb_row];
+    } else
+    {
+        yabove_row = x->dst.y_buffer - x->dst.y_stride;
+
+        for (i = 0; i < 16; i++)
+            yleft_buf[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+        yleft_col = yleft_buf;
+    }
+
+    ytop_left = yabove_row[-1];
+
+    /* for Y */
+    switch (x->mode_info_context->mbmi.mode)
+    {
+    case DC_PRED:
+    {
+        int expected_dc;
+        int i;
+        int shift;
+        int average = 0;
+
+
+        if (x->up_available || x->left_available)
+        {
+            if (x->up_available)
+            {
+                for (i = 0; i < 16; i++)
+                {
+                    average += yabove_row[i];
+                }
+            }
+
+            if (x->left_available)
+            {
+
+                for (i = 0; i < 16; i++)
+                {
+                    average += yleft_col[i];
+                }
+
+            }
+
+
+
+            shift = 3 + x->up_available + x->left_available;
+            expected_dc = (average + (1 << (shift - 1))) >> shift;
+        }
+        else
+        {
+            expected_dc = 128;
+        }
+
+        vpx_memset(ypred_ptr, expected_dc, 256);
+    }
+    break;
+    case V_PRED:
+    {
+
+        for (r = 0; r < 16; r++)
+        {
+
+            ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+            ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+            ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+            ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+            ypred_ptr += 16;
+        }
+    }
+    break;
+    case H_PRED:
+    {
+
+        for (r = 0; r < 16; r++)
+        {
+
+            vpx_memset(ypred_ptr, yleft_col[r], 16);
+            ypred_ptr += 16;
+        }
+
+    }
+    break;
+    case TM_PRED:
+    {
+
+        for (r = 0; r < 16; r++)
+        {
+            for (c = 0; c < 16; c++)
+            {
+                int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
+
+                if (pred < 0)
+                    pred = 0;
+
+                if (pred > 255)
+                    pred = 255;
+
+                ypred_ptr[c] = pred;
+            }
+
+            ypred_ptr += 16;
+        }
+
+    }
+    break;
+    case B_PRED:
+    case NEARESTMV:
+    case NEARMV:
+    case ZEROMV:
+    case NEWMV:
+    case SPLITMV:
+    case MB_MODE_COUNT:
+        break;
+    }
+#else
+    (void) pbi;
+    (void) x;
+    (void) mb_row;
+    (void) mb_col;
+#endif
+}
+
+void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+    unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
+    unsigned char *yleft_col;
+    unsigned char yleft_buf[16];
+    unsigned char ytop_left;      /* = yabove_row[-1]; */
+    unsigned char *ypred_ptr = x->predictor;
+    int r, c, i;
+
+    int y_stride = x->dst.y_stride;
+    ypred_ptr = x->dst.y_buffer; /*x->predictor;*/
+
+    if (pbi->common.filter_level)
+    {
+        yabove_row = pbi->mt_yabove_row[mb_row] + mb_col*16 +32;
+        yleft_col = pbi->mt_yleft_col[mb_row];
+    } else
+    {
+        yabove_row = x->dst.y_buffer - x->dst.y_stride;
+
+        for (i = 0; i < 16; i++)
+            yleft_buf[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+        yleft_col = yleft_buf;
+    }
+
+    ytop_left = yabove_row[-1];
+
+    /* for Y */
+    switch (x->mode_info_context->mbmi.mode)
+    {
+    case DC_PRED:
+    {
+        int expected_dc;
+        int i;
+        int shift;
+        int average = 0;
+
+
+        if (x->up_available || x->left_available)
+        {
+            if (x->up_available)
+            {
+                for (i = 0; i < 16; i++)
+                {
+                    average += yabove_row[i];
+                }
+            }
+
+            if (x->left_available)
+            {
+
+                for (i = 0; i < 16; i++)
+                {
+                    average += yleft_col[i];
+                }
+
+            }
+
+
+
+            shift = 3 + x->up_available + x->left_available;
+            expected_dc = (average + (1 << (shift - 1))) >> shift;
+        }
+        else
+        {
+            expected_dc = 128;
+        }
+
+        /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+        for (r = 0; r < 16; r++)
+        {
+            vpx_memset(ypred_ptr, expected_dc, 16);
+            ypred_ptr += y_stride; /*16;*/
+        }
+    }
+    break;
+    case V_PRED:
+    {
+
+        for (r = 0; r < 16; r++)
+        {
+
+            ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+            ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+            ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+            ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+            ypred_ptr += y_stride; /*16;*/
+        }
+    }
+    break;
+    case H_PRED:
+    {
+
+        for (r = 0; r < 16; r++)
+        {
+
+            vpx_memset(ypred_ptr, yleft_col[r], 16);
+            ypred_ptr += y_stride;  /*16;*/
+        }
+
+    }
+    break;
+    case TM_PRED:
+    {
+
+        for (r = 0; r < 16; r++)
+        {
+            for (c = 0; c < 16; c++)
+            {
+                int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
+
+                if (pred < 0)
+                    pred = 0;
+
+                if (pred > 255)
+                    pred = 255;
+
+                ypred_ptr[c] = pred;
+            }
+
+            ypred_ptr += y_stride;  /*16;*/
+        }
+
+    }
+    break;
+    case B_PRED:
+    case NEARESTMV:
+    case NEARMV:
+    case ZEROMV:
+    case NEWMV:
+    case SPLITMV:
+    case MB_MODE_COUNT:
+        break;
+    }
+#else
+    (void) pbi;
+    (void) x;
+    (void) mb_row;
+    (void) mb_col;
+#endif
+}
+
+void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+    unsigned char *uabove_row;   /* = x->dst.u_buffer - x->dst.uv_stride; */
+    unsigned char *uleft_col;    /*[16];*/
+    unsigned char uleft_buf[8];
+    unsigned char utop_left;     /* = uabove_row[-1]; */
+    unsigned char *vabove_row;   /* = x->dst.v_buffer - x->dst.uv_stride; */
+    unsigned char *vleft_col;    /*[20];*/
+    unsigned char vleft_buf[8];
+    unsigned char vtop_left;     /* = vabove_row[-1]; */
+    unsigned char *upred_ptr = &x->predictor[256];
+    unsigned char *vpred_ptr = &x->predictor[320];
+    int i, j;
+
+    if (pbi->common.filter_level)
+    {
+        uabove_row = pbi->mt_uabove_row[mb_row] + mb_col*8 +16;
+        vabove_row = pbi->mt_vabove_row[mb_row] + mb_col*8 +16;
+        uleft_col = pbi->mt_uleft_col[mb_row];
+        vleft_col = pbi->mt_vleft_col[mb_row];
+    } else
+    {
+        uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+        vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+
+        for (i = 0; i < 8; i++)
+        {
+            uleft_buf[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+            vleft_buf[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+        }
+        uleft_col = uleft_buf;
+        vleft_col = vleft_buf;
+    }
+    utop_left = uabove_row[-1];
+    vtop_left = vabove_row[-1];
+
+    switch (x->mode_info_context->mbmi.uv_mode)
+    {
+    case DC_PRED:
+    {
+        int expected_udc;
+        int expected_vdc;
+        int i;
+        int shift;
+        int Uaverage = 0;
+        int Vaverage = 0;
+
+        if (x->up_available)
+        {
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uabove_row[i];
+                Vaverage += vabove_row[i];
+            }
+        }
+
+        if (x->left_available)
+        {
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uleft_col[i];
+                Vaverage += vleft_col[i];
+            }
+        }
+
+        if (!x->up_available && !x->left_available)
+        {
+            expected_udc = 128;
+            expected_vdc = 128;
+        }
+        else
+        {
+            shift = 2 + x->up_available + x->left_available;
+            expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+            expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+        }
+
+
+        vpx_memset(upred_ptr, expected_udc, 64);
+        vpx_memset(vpred_ptr, expected_vdc, 64);
+
+
+    }
+    break;
+    case V_PRED:
+    {
+        int i;
+
+        for (i = 0; i < 8; i++)
+        {
+            vpx_memcpy(upred_ptr, uabove_row, 8);
+            vpx_memcpy(vpred_ptr, vabove_row, 8);
+            upred_ptr += 8;
+            vpred_ptr += 8;
+        }
+
+    }
+    break;
+    case H_PRED:
+    {
+        int i;
+
+        for (i = 0; i < 8; i++)
+        {
+            vpx_memset(upred_ptr, uleft_col[i], 8);
+            vpx_memset(vpred_ptr, vleft_col[i], 8);
+            upred_ptr += 8;
+            vpred_ptr += 8;
+        }
+    }
+
+    break;
+    case TM_PRED:
+    {
+        int i;
+
+        for (i = 0; i < 8; i++)
+        {
+            for (j = 0; j < 8; j++)
+            {
+                int predu = uleft_col[i] + uabove_row[j] - utop_left;
+                int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+                if (predu < 0)
+                    predu = 0;
+
+                if (predu > 255)
+                    predu = 255;
+
+                if (predv < 0)
+                    predv = 0;
+
+                if (predv > 255)
+                    predv = 255;
+
+                upred_ptr[j] = predu;
+                vpred_ptr[j] = predv;
+            }
+
+            upred_ptr += 8;
+            vpred_ptr += 8;
+        }
+
+    }
+    break;
+    case B_PRED:
+    case NEARESTMV:
+    case NEARMV:
+    case ZEROMV:
+    case NEWMV:
+    case SPLITMV:
+    case MB_MODE_COUNT:
+        break;
+    }
+#else
+    (void) pbi;
+    (void) x;
+    (void) mb_row;
+    (void) mb_col;
+#endif
+}
+
+void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+    unsigned char *uabove_row;  /* = x->dst.u_buffer - x->dst.uv_stride; */
+    unsigned char *uleft_col;   /*[16];*/
+    unsigned char uleft_buf[8];
+    unsigned char utop_left;    /* = uabove_row[-1]; */
+    unsigned char *vabove_row;  /* = x->dst.v_buffer - x->dst.uv_stride; */
+    unsigned char *vleft_col;   /*[20];*/
+    unsigned char vleft_buf[8];
+    unsigned char vtop_left;    /* = vabove_row[-1]; */
+    unsigned char *upred_ptr = x->dst.u_buffer; /*&x->predictor[256];*/
+    unsigned char *vpred_ptr = x->dst.v_buffer; /*&x->predictor[320];*/
+    int uv_stride = x->dst.uv_stride;
+    int i, j;
+
+    if (pbi->common.filter_level)
+    {
+        uabove_row = pbi->mt_uabove_row[mb_row] + mb_col*8 +16;
+        vabove_row = pbi->mt_vabove_row[mb_row] + mb_col*8 +16;
+        uleft_col = pbi->mt_uleft_col[mb_row];
+        vleft_col = pbi->mt_vleft_col[mb_row];
+    } else
+    {
+        uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+        vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+
+        for (i = 0; i < 8; i++)
+        {
+            uleft_buf[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+            vleft_buf[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+        }
+        uleft_col = uleft_buf;
+        vleft_col = vleft_buf;
+    }
+    utop_left = uabove_row[-1];
+    vtop_left = vabove_row[-1];
+
+    switch (x->mode_info_context->mbmi.uv_mode)
+    {
+    case DC_PRED:
+    {
+        int expected_udc;
+        int expected_vdc;
+        int i;
+        int shift;
+        int Uaverage = 0;
+        int Vaverage = 0;
+
+        if (x->up_available)
+        {
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uabove_row[i];
+                Vaverage += vabove_row[i];
+            }
+        }
+
+        if (x->left_available)
+        {
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uleft_col[i];
+                Vaverage += vleft_col[i];
+            }
+        }
+
+        if (!x->up_available && !x->left_available)
+        {
+            expected_udc = 128;
+            expected_vdc = 128;
+        }
+        else
+        {
+            shift = 2 + x->up_available + x->left_available;
+            expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+            expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+        }
+
+
+        /*vpx_memset(upred_ptr,expected_udc,64);
+        vpx_memset(vpred_ptr,expected_vdc,64);*/
+        for (i = 0; i < 8; i++)
+        {
+            vpx_memset(upred_ptr, expected_udc, 8);
+            vpx_memset(vpred_ptr, expected_vdc, 8);
+            upred_ptr += uv_stride; /*8;*/
+            vpred_ptr += uv_stride; /*8;*/
+        }
+    }
+    break;
+    case V_PRED:
+    {
+        int i;
+
+        for (i = 0; i < 8; i++)
+        {
+            vpx_memcpy(upred_ptr, uabove_row, 8);
+            vpx_memcpy(vpred_ptr, vabove_row, 8);
+            upred_ptr += uv_stride; /*8;*/
+            vpred_ptr += uv_stride; /*8;*/
+        }
+
+    }
+    break;
+    case H_PRED:
+    {
+        int i;
+
+        for (i = 0; i < 8; i++)
+        {
+            vpx_memset(upred_ptr, uleft_col[i], 8);
+            vpx_memset(vpred_ptr, vleft_col[i], 8);
+            upred_ptr += uv_stride; /*8;*/
+            vpred_ptr += uv_stride; /*8;*/
+        }
+    }
+
+    break;
+    case TM_PRED:
+    {
+        int i;
+
+        for (i = 0; i < 8; i++)
+        {
+            for (j = 0; j < 8; j++)
+            {
+                int predu = uleft_col[i] + uabove_row[j] - utop_left;
+                int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+                if (predu < 0)
+                    predu = 0;
+
+                if (predu > 255)
+                    predu = 255;
+
+                if (predv < 0)
+                    predv = 0;
+
+                if (predv > 255)
+                    predv = 255;
+
+                upred_ptr[j] = predu;
+                vpred_ptr[j] = predv;
+            }
+
+            upred_ptr += uv_stride; /*8;*/
+            vpred_ptr += uv_stride; /*8;*/
+        }
+
+    }
+    break;
+    case B_PRED:
+    case NEARESTMV:
+    case NEARMV:
+    case ZEROMV:
+    case NEWMV:
+    case SPLITMV:
+    case MB_MODE_COUNT:
+        break;
+    }
+#else
+    (void) pbi;
+    (void) x;
+    (void) mb_row;
+    (void) mb_col;
+#endif
+}
+
+
+void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
+                          MACROBLOCKD *xd,
+                          int b_mode,
+                          unsigned char *predictor,
+                          int mb_row,
+                          int mb_col,
+                          int num)
+{
+#if CONFIG_MULTITHREAD
+    int i, r, c;
+
+    unsigned char *Above;   /* = *(x->base_dst) + x->dst - x->dst_stride; */
+    unsigned char Left[4];
+    unsigned char top_left; /* = Above[-1]; */
+
+    BLOCKD *x = &xd->block[num];
+
+    /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
+    if (num < 4 && pbi->common.filter_level)
+        Above = pbi->mt_yabove_row[mb_row] + mb_col*16 + num*4 + 32;
+    else
+        Above = *(x->base_dst) + x->dst - x->dst_stride;
+
+    if (num%4==0 && pbi->common.filter_level)
+    {
+        for (i=0; i<4; i++)
+            Left[i] = pbi->mt_yleft_col[mb_row][num + i];
+    }else
+    {
+        Left[0] = (*(x->base_dst))[x->dst - 1];
+        Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
+        Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
+        Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+    }
+
+    if ((num==4 || num==8 || num==12) && pbi->common.filter_level)
+        top_left = pbi->mt_yleft_col[mb_row][num-1];
+    else
+        top_left = Above[-1];
+
+     switch (b_mode)
+    {
+    case B_DC_PRED:
+    {
+        int expected_dc = 0;
+
+        for (i = 0; i < 4; i++)
+        {
+            expected_dc += Above[i];
+            expected_dc += Left[i];
+        }
+
+        expected_dc = (expected_dc + 4) >> 3;
+
+        for (r = 0; r < 4; r++)
+        {
+            for (c = 0; c < 4; c++)
+            {
+                predictor[c] = expected_dc;
+            }
+
+            predictor += 16;
+        }
+    }
+    break;
+    case B_TM_PRED:
+    {
+        /* prediction similar to true_motion prediction */
+        for (r = 0; r < 4; r++)
+        {
+            for (c = 0; c < 4; c++)
+            {
+                int pred = Above[c] - top_left + Left[r];
+
+                if (pred < 0)
+                    pred = 0;
+
+                if (pred > 255)
+                    pred = 255;
+
+                predictor[c] = pred;
+            }
+
+            predictor += 16;
+        }
+    }
+    break;
+
+    case B_VE_PRED:
+    {
+
+        unsigned int ap[4];
+        ap[0] = (top_left  + 2 * Above[0] + Above[1] + 2) >> 2;
+        ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
+        ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
+        ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
+
+        for (r = 0; r < 4; r++)
+        {
+            for (c = 0; c < 4; c++)
+            {
+
+                predictor[c] = ap[c];
+            }
+
+            predictor += 16;
+        }
+
+    }
+    break;
+
+
+    case B_HE_PRED:
+    {
+
+        unsigned int lp[4];
+        lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
+        lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
+        lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
+        lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
+
+        for (r = 0; r < 4; r++)
+        {
+            for (c = 0; c < 4; c++)
+            {
+                predictor[c] = lp[r];
+            }
+
+            predictor += 16;
+        }
+    }
+    break;
+    case B_LD_PRED:
+    {
+        unsigned char *ptr = Above;
+        predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+        predictor[0 * 16 + 1] =
+            predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+        predictor[0 * 16 + 2] =
+            predictor[1 * 16 + 1] =
+                predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+        predictor[0 * 16 + 3] =
+            predictor[1 * 16 + 2] =
+                predictor[2 * 16 + 1] =
+                    predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+        predictor[1 * 16 + 3] =
+            predictor[2 * 16 + 2] =
+                predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+        predictor[2 * 16 + 3] =
+            predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+        predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+
+    }
+    break;
+    case B_RD_PRED:
+    {
+
+        unsigned char pp[9];
+
+        pp[0] = Left[3];
+        pp[1] = Left[2];
+        pp[2] = Left[1];
+        pp[3] = Left[0];
+        pp[4] = top_left;
+        pp[5] = Above[0];
+        pp[6] = Above[1];
+        pp[7] = Above[2];
+        pp[8] = Above[3];
+
+        predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+        predictor[3 * 16 + 1] =
+            predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+        predictor[3 * 16 + 2] =
+            predictor[2 * 16 + 1] =
+                predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+        predictor[3 * 16 + 3] =
+            predictor[2 * 16 + 2] =
+                predictor[1 * 16 + 1] =
+                    predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+        predictor[2 * 16 + 3] =
+            predictor[1 * 16 + 2] =
+                predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+        predictor[1 * 16 + 3] =
+            predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+        predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+
+    }
+    break;
+    case B_VR_PRED:
+    {
+
+        unsigned char pp[9];
+
+        pp[0] = Left[3];
+        pp[1] = Left[2];
+        pp[2] = Left[1];
+        pp[3] = Left[0];
+        pp[4] = top_left;
+        pp[5] = Above[0];
+        pp[6] = Above[1];
+        pp[7] = Above[2];
+        pp[8] = Above[3];
+
+
+        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+        predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+        predictor[3 * 16 + 1] =
+            predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+        predictor[2 * 16 + 1] =
+            predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
+        predictor[3 * 16 + 2] =
+            predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+        predictor[2 * 16 + 2] =
+            predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
+        predictor[3 * 16 + 3] =
+            predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+        predictor[2 * 16 + 3] =
+            predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
+        predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+        predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+
+    }
+    break;
+    case B_VL_PRED:
+    {
+
+        unsigned char *pp = Above;
+
+        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+        predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+        predictor[2 * 16 + 0] =
+            predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
+        predictor[1 * 16 + 1] =
+            predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+        predictor[2 * 16 + 1] =
+            predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
+        predictor[3 * 16 + 1] =
+            predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+        predictor[0 * 16 + 3] =
+            predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
+        predictor[1 * 16 + 3] =
+            predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+        predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+        predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+    }
+    break;
+
+    case B_HD_PRED:
+    {
+        unsigned char pp[9];
+        pp[0] = Left[3];
+        pp[1] = Left[2];
+        pp[2] = Left[1];
+        pp[3] = Left[0];
+        pp[4] = top_left;
+        pp[5] = Above[0];
+        pp[6] = Above[1];
+        pp[7] = Above[2];
+        pp[8] = Above[3];
+
+
+        predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+        predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+        predictor[2 * 16 + 0] =
+            predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
+        predictor[2 * 16 + 1] =
+            predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+        predictor[2 * 16 + 2] =
+            predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+        predictor[2 * 16 + 3] =
+            predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+        predictor[1 * 16 + 2] =
+            predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
+        predictor[1 * 16 + 3] =
+            predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+        predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+        predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+    }
+    break;
+
+
+    case B_HU_PRED:
+    {
+        unsigned char *pp = Left;
+        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+        predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+        predictor[0 * 16 + 2] =
+            predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
+        predictor[0 * 16 + 3] =
+            predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+        predictor[1 * 16 + 2] =
+            predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+        predictor[1 * 16 + 3] =
+            predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+        predictor[2 * 16 + 2] =
+            predictor[2 * 16 + 3] =
+                predictor[3 * 16 + 0] =
+                    predictor[3 * 16 + 1] =
+                        predictor[3 * 16 + 2] =
+                            predictor[3 * 16 + 3] = pp[3];
+    }
+    break;
+
+
+    }
+#else
+    (void) pbi;
+    (void) xd;
+    (void) b_mode;
+    (void) predictor;
+    (void) mb_row;
+    (void) mb_col;
+    (void) num;
+#endif
+}
+
+/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
+ * to the right prediction have filled in pixels to use.
+ */
+void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+    unsigned char *above_right;   /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
+    unsigned int *src_ptr;
+    unsigned int *dst_ptr0;
+    unsigned int *dst_ptr1;
+    unsigned int *dst_ptr2;
+
+    if (pbi->common.filter_level)
+        above_right = pbi->mt_yabove_row[mb_row] + mb_col*16 + 32 +16;
+    else
+        above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
+
+    src_ptr = (unsigned int *)above_right;
+    /*dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
+    dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
+    dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);*/
+    dst_ptr0 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 3 * x->block[0].dst_stride);
+    dst_ptr1 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 7 * x->block[0].dst_stride);
+    dst_ptr2 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 11 * x->block[0].dst_stride);
+    *dst_ptr0 = *src_ptr;
+    *dst_ptr1 = *src_ptr;
+    *dst_ptr2 = *src_ptr;
+#else
+    (void) pbi;
+    (void) x;
+    (void) mb_row;
+    (void) mb_col;
+#endif
+}
diff --git a/vp8/decoder/reconintra_mt.h b/vp8/decoder/reconintra_mt.h
new file mode 100644
index 000000000..d401295b2
--- /dev/null
+++ b/vp8/decoder/reconintra_mt.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA_MT_H
+#define __INC_RECONINTRA_MT_H
+
+/* reconintra functions used in multi-threaded decoder */
+#if CONFIG_MULTITHREAD
+extern void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+
+extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int mb_row, int mb_col, int num);
+extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+#endif
+
+#endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index e35d1757f..fc2fad516 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -1,16 +1,20 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #ifndef WIN32
 # include <unistd.h>
 #endif
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#endif
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "threading.h"
@@ -18,20 +22,22 @@
 #include "loopfilter.h"
 #include "extend.h"
 #include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "reconinter.h"
+#include "reconintra_mt.h"
 
-extern void vp8_decode_mb_row(VP8D_COMP *pbi,
-                              VP8_COMMON *pc,
-                              int mb_row,
-                              MACROBLOCKD *xd);
-
+extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+extern void clamp_mvs(MACROBLOCKD *xd);
 extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
-extern void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
 
 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
-
-
-
 #if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i, j;
@@ -42,15 +48,11 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
 #if CONFIG_RUNTIME_CPU_DETECT
         mbd->rtcd = xd->rtcd;
 #endif
-
-
         mbd->subpixel_predict        = xd->subpixel_predict;
         mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
         mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
         mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
-        mbd->gf_active_ptr            = xd->gf_active_ptr;
 
-        mbd->mode_info        = pc->mi - 1;
         mbd->mode_info_context = pc->mi   + pc->mode_info_stride * (i + 1);
         mbd->mode_info_stride  = pc->mode_info_stride;
 
@@ -58,11 +60,8 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
         mbd->frames_since_golden      = pc->frames_since_golden;
         mbd->frames_till_alt_ref_frame  = pc->frames_till_alt_ref_frame;
 
-        mbd->pre = pc->last_frame;
-        mbd->dst = pc->new_frame;
-
-
-
+        mbd->pre = pc->yv12_fb[pc->lst_fb_idx];
+        mbd->dst = pc->yv12_fb[pc->new_fb_idx];
 
         vp8_setup_block_dptrs(mbd);
         vp8_build_block_doffsets(mbd);
@@ -70,8 +69,14 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
         mbd->mb_segement_abs_delta     = xd->mb_segement_abs_delta;
         vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
 
-        mbd->mbmi.mode = DC_PRED;
-        mbd->mbmi.uv_mode = DC_PRED;
+        /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+        vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+        /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+        vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+        /*unsigned char mode_ref_lf_delta_enabled;
+        unsigned char mode_ref_lf_delta_update;*/
+        mbd->mode_ref_lf_delta_enabled    = xd->mode_ref_lf_delta_enabled;
+        mbd->mode_ref_lf_delta_update    = xd->mode_ref_lf_delta_update;
 
         mbd->current_bc = &pbi->bc2;
 
@@ -81,6 +86,8 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
         }
     }
 
+    for (i=0; i< pc->mb_rows; i++)
+        pbi->mt_current_mb_col[i]=-1;
 #else
     (void) pbi;
     (void) xd;
@@ -90,348 +97,390 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
 }
 
 
-THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
+void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
 {
 #if CONFIG_MULTITHREAD
-    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
-    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
-    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
-    ENTROPY_CONTEXT mb_row_left_context[4][4];
+    int eobtotal = 0;
+    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
+    VP8_COMMON *pc = &pbi->common;
 
-    while (1)
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
     {
-        if (pbi->b_multithreaded_rd == 0)
-            break;
-
-        //if(WaitForSingleObject(pbi->h_event_mbrdecoding[ithread], INFINITE) == WAIT_OBJECT_0)
-        if (sem_wait(&pbi->h_event_mbrdecoding[ithread]) == 0)
-        {
-            if (pbi->b_multithreaded_rd == 0)
-                break;
-            else
-            {
-                VP8_COMMON *pc = &pbi->common;
-                int mb_row       = mbrd->mb_row;
-                MACROBLOCKD *xd = &mbrd->mbd;
-
-                //printf("ithread:%d mb_row %d\n", ithread, mb_row);
-                int i;
-                int recon_yoffset, recon_uvoffset;
-                int mb_col;
-                int recon_y_stride = pc->last_frame.y_stride;
-                int recon_uv_stride = pc->last_frame.uv_stride;
-
-                volatile int *last_row_current_mb_col;
-
-                if (ithread > 0)
-                    last_row_current_mb_col = &pbi->mb_row_di[ithread-1].current_mb_col;
-                else
-                    last_row_current_mb_col = &pbi->current_mb_col_main;
-
-                recon_yoffset = mb_row * recon_y_stride * 16;
-                recon_uvoffset = mb_row * recon_uv_stride * 8;
-                // reset above block coeffs
-
-                xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
-                xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
-                xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
-                xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
-                xd->left_context = mb_row_left_context;
-                vpx_memset(mb_row_left_context, 0, sizeof(mb_row_left_context));
-                xd->up_available = (mb_row != 0);
-
-                xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-                xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
-                for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
-                {
-
-                    while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != pc->mb_cols - 1)
-                    {
-                        x86_pause_hint();
-                        thread_sleep(0);
-                    }
-
-                    // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
-                    vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi, 32); //sizeof(MB_MODE_INFO) );
-
-                    if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
-                    {
-                        for (i = 0; i < 16; i++)
-                        {
-                            BLOCKD *d = &xd->block[i];
-                            vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
-                        }
-                    }
-
-                    // Distance of Mb to the various image edges.
-                    // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
-                    xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-
-                    xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
-                    xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
-                    xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;
-
-                    xd->left_available = (mb_col != 0);
-
-                    // Select the appropriate reference frame for this MB
-                    if (xd->mbmi.ref_frame == LAST_FRAME)
-                    {
-                        xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
-                        xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
-                        xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
-                    }
-                    else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
-                    {
-                        // Golden frame reconstruction buffer
-                        xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
-                        xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
-                        xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
-                    }
-                    else
-                    {
-                        // Alternate reference frame reconstruction buffer
-                        xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
-                        xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
-                        xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
-                    }
-
-                    vp8_build_uvmvs(xd, pc->full_pixel);
-
-                    vp8dx_bool_decoder_fill(xd->current_bc);
-                    vp8_decode_macroblock(pbi, xd);
-
+        vp8_reset_mb_tokens_context(xd);
+    }
+    else
+    {
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+    }
 
-                    recon_yoffset += 16;
-                    recon_uvoffset += 8;
+    /* Perform temporary clamping of the MV to be used for prediction */
+    if (do_clamp)
+    {
+        clamp_mvs(xd);
+    }
 
-                    ++xd->mode_info_context;  /* next mb */
+    xd->mode_info_context->mbmi.dc_diff = 1;
 
-                    xd->gf_active_ptr++;      // GF useage flag for next MB
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    {
+        xd->mode_info_context->mbmi.dc_diff = 0;
 
-                    xd->above_context[Y1CONTEXT] += 4;
-                    xd->above_context[UCONTEXT ] += 2;
-                    xd->above_context[VCONTEXT ] += 2;
-                    xd->above_context[Y2CONTEXT] ++;
-                    pbi->mb_row_di[ithread].current_mb_col = mb_col;
+        /*mt_skip_recon_mb(pbi, xd, mb_row, mb_col);*/
+        if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+        {
+            vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
+            vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
+        }
+        else
+        {
+            vp8_build_inter_predictors_mb_s(xd);
+        }
+        return;
+    }
 
-                }
+    if (xd->segmentation_enabled)
+        mb_init_dequantizer(pbi, xd);
 
-                // adjust to the next row of mbs
-                vp8_extend_mb_row(
-                    &pc->new_frame,
-                    xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
-                );
+    /* do prediction */
+    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);
 
-                ++xd->mode_info_context;      /* skip prediction column */
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
+        {
+            vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col);
+        } else {
+            vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
+        }
+    }
+    else
+    {
+        vp8_build_inter_predictors_mb(xd);
+    }
 
-                // since we have multithread
-                xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+    /* dequantization and idct */
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        BLOCKD *b = &xd->block[24];
+        DEQUANT_INVOKE(&pbi->dequant, block)(b);
 
-                //memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
-                if ((mb_row & 1) == 1)
-                {
-                    pbi->last_mb_row_decoded = mb_row;
-                    //printf("S%d", pbi->last_mb_row_decoded);
-                }
+        /* do 2nd order transform on the dc block */
+        if (xd->eobs[24] > 1)
+        {
+            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+            ((int *)b->qcoeff)[0] = 0;
+            ((int *)b->qcoeff)[1] = 0;
+            ((int *)b->qcoeff)[2] = 0;
+            ((int *)b->qcoeff)[3] = 0;
+            ((int *)b->qcoeff)[4] = 0;
+            ((int *)b->qcoeff)[5] = 0;
+            ((int *)b->qcoeff)[6] = 0;
+            ((int *)b->qcoeff)[7] = 0;
+        }
+        else
+        {
+            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+            ((int *)b->qcoeff)[0] = 0;
+        }
 
-                if (ithread == (pbi->decoding_thread_count - 1) || mb_row == pc->mb_rows - 1)
-                {
-                    //SetEvent(pbi->h_event_main);
-                    sem_post(&pbi->h_event_main);
+        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+                        (xd->qcoeff, xd->block[0].dequant,
+                         xd->predictor, xd->dst.y_buffer,
+                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+    }
+    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            BLOCKD *b = &xd->block[i];
+            vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor, mb_row, mb_col, i);
 
-                }
+            if (xd->eobs[i] > 1)
+            {
+                DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                    (b->qcoeff, b->dequant,  b->predictor,
+                    *(b->base_dst) + b->dst, 16, b->dst_stride);
+            }
+            else
+            {
+                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                    (b->qcoeff[0] * b->dequant[0], b->predictor,
+                    *(b->base_dst) + b->dst, 16, b->dst_stride);
+                ((int *)b->qcoeff)[0] = 0;
             }
         }
     }
+    else
+    {
+        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+                        (xd->qcoeff, xd->block[0].dequant,
+                         xd->predictor, xd->dst.y_buffer,
+                         xd->dst.y_stride, xd->eobs);
+    }
 
+    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+                    (xd->qcoeff+16*16, xd->block[16].dequant,
+                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+                     xd->dst.uv_stride, xd->eobs+16);
 #else
-    (void) p_data;
+    (void) pbi;
+    (void) xd;
+    (void) mb_row;
+    (void) mb_col;
 #endif
-
-    return 0 ;
 }
 
-THREAD_FUNCTION vp8_thread_loop_filter(void *p_data)
+
+THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
 {
 #if CONFIG_MULTITHREAD
-    VP8D_COMP *pbi = (VP8D_COMP *)p_data;
+    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
+    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
+    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
+    ENTROPY_CONTEXT_PLANES mb_row_left_context;
 
     while (1)
     {
-        if (pbi->b_multithreaded_lf == 0)
+        if (pbi->b_multithreaded_rd == 0)
             break;
 
-        //printf("before waiting for start_lpf\n");
-
-        //if(WaitForSingleObject(pbi->h_event_start_lpf, INFINITE) == WAIT_OBJECT_0)
-        if (sem_wait(&pbi->h_event_start_lpf) == 0)
+        /*if(WaitForSingleObject(pbi->h_event_start_decoding[ithread], INFINITE) == WAIT_OBJECT_0)*/
+        if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
         {
-            if (pbi->b_multithreaded_lf == 0) // we're shutting down
+            if (pbi->b_multithreaded_rd == 0)
                 break;
             else
             {
+                VP8_COMMON *pc = &pbi->common;
+                MACROBLOCKD *xd = &mbrd->mbd;
 
-                VP8_COMMON *cm  = &pbi->common;
-                MACROBLOCKD *mbd = &pbi->lpfmb;
-                int default_filt_lvl = pbi->common.filter_level;
+                int mb_row;
+                int num_part = 1 << pbi->common.multi_token_partition;
+                volatile int *last_row_current_mb_col;
+                int nsync = pbi->sync_range;
 
-                YV12_BUFFER_CONFIG *post = &cm->new_frame;
-                loop_filter_info *lfi = cm->lf_info;
+                for (mb_row = ithread+1; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
+                {
+                    int i;
+                    int recon_yoffset, recon_uvoffset;
+                    int mb_col;
+                    int ref_fb_idx = pc->lst_fb_idx;
+                    int dst_fb_idx = pc->new_fb_idx;
+                    int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+                    int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
 
-                int mb_row;
-                int mb_col;
+                    int filter_level;
+                    loop_filter_info *lfi = pc->lf_info;
+                    int alt_flt_enabled = xd->segmentation_enabled;
+                    int Segment;
 
+                    pbi->mb_row_di[ithread].mb_row = mb_row;
+                    pbi->mb_row_di[ithread].mbd.current_bc =  &pbi->mbc[mb_row%num_part];
 
-                int baseline_filter_level[MAX_MB_SEGMENTS];
-                int filter_level;
-                int alt_flt_enabled = mbd->segmentation_enabled;
+                    last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
 
-                int i;
-                unsigned char *y_ptr, *u_ptr, *v_ptr;
+                    recon_yoffset = mb_row * recon_y_stride * 16;
+                    recon_uvoffset = mb_row * recon_uv_stride * 8;
+                    /* reset above block coeffs */
 
-                volatile int *last_mb_row_decoded = &pbi->last_mb_row_decoded;
+                    xd->above_context = pc->above_context;
+                    xd->left_context = &mb_row_left_context;
+                    vpx_memset(&mb_row_left_context, 0, sizeof(mb_row_left_context));
+                    xd->up_available = (mb_row != 0);
 
-                //MODE_INFO * this_mb_mode_info = cm->mi;
-                mbd->mode_info_context = cm->mi;          // Point at base of Mb MODE_INFO list
+                    xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+                    xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
 
-                // Note the baseline filter values for each segment
-                if (alt_flt_enabled)
-                {
-                    for (i = 0; i < MAX_MB_SEGMENTS; i++)
+                    for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
                     {
-                        if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                            baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                        else
+                        if ((mb_col & (nsync-1)) == 0)
                         {
-                            baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                            baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  // Clamp to valid range
+                            while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
+                            {
+                                x86_pause_hint();
+                                thread_sleep(0);
+                            }
                         }
-                    }
-                }
-                else
-                {
-                    for (i = 0; i < MAX_MB_SEGMENTS; i++)
-                        baseline_filter_level[i] = default_filt_lvl;
-                }
 
-                // Initialize the loop filter for this frame.
-                vp8_init_loop_filter(cm);
+                        if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
+                        {
+                            for (i = 0; i < 16; i++)
+                            {
+                                BLOCKD *d = &xd->block[i];
+                                vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
+                            }
+                        }
 
-                // Set up the buffer pointers
-                y_ptr = post->y_buffer;
-                u_ptr = post->u_buffer;
-                v_ptr = post->v_buffer;
+                        if(pbi->common.filter_level)
+                        {
+                            /*update loopfilter info*/
+                            Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                            filter_level = pbi->mt_baseline_filter_level[Segment];
+                            /* Distance of Mb to the various image edges.
+                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                             * Apply any context driven MB level adjustment
+                             */
+                            vp8_adjust_mb_lf_value(xd, &filter_level);
+                        }
 
-                // vp8_filter each macro block
-                for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
-                {
+                        /* Distance of Mb to the various image edges.
+                         * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                         */
+                        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
 
-                    while (mb_row >= *last_mb_row_decoded)
-                    {
-                        x86_pause_hint();
-                        thread_sleep(0);
-                    }
-
-                    //printf("R%d", mb_row);
-                    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
-                    {
-                        int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+                        xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+                        xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+                        xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
 
-                        filter_level = baseline_filter_level[Segment];
+                        xd->left_available = (mb_col != 0);
 
-                        // Apply any context driven MB level adjustment
-                        vp8_adjust_mb_lf_value(mbd, &filter_level);
-
-                        if (filter_level)
-                        {
-                            if (mb_col > 0)
-                                cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                        /* Select the appropriate reference frame for this MB */
+                        if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+                            ref_fb_idx = pc->lst_fb_idx;
+                        else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+                            ref_fb_idx = pc->gld_fb_idx;
+                        else
+                            ref_fb_idx = pc->alt_fb_idx;
 
-                            if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                                cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                        xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+                        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+                        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
 
-                            // don't apply across umv border
-                            if (mb_row > 0)
-                                cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                        vp8_build_uvmvs(xd, pc->full_pixel);
+                        vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
 
-                            if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                                cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                        if (pbi->common.filter_level)
+                        {
+                            if( mb_row != pc->mb_rows-1 )
+                            {
+                                /* Save decoded MB last row data for next-row decoding */
+                                vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+                                vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+                                vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+                            }
+
+                            /* save left_col for next MB decoding */
+                            if(mb_col != pc->mb_cols-1)
+                            {
+                                MODE_INFO *next = xd->mode_info_context +1;
+
+                                if (xd->frame_type == KEY_FRAME  ||  next->mbmi.ref_frame == INTRA_FRAME)
+                                {
+                                    for (i = 0; i < 16; i++)
+                                        pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+                                    for (i = 0; i < 8; i++)
+                                    {
+                                        pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+                                        pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+                                    }
+                                }
+                            }
+
+                          /* loopfilter on this macroblock. */
+                            if (filter_level)
+                            {
+                                if (mb_col > 0)
+                                    pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+                                if (xd->mode_info_context->mbmi.dc_diff > 0)
+                                    pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+                                /* don't apply across umv border */
+                                if (mb_row > 0)
+                                    pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+                                if (xd->mode_info_context->mbmi.dc_diff > 0)
+                                    pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                            }
                         }
 
-                        y_ptr += 16;
-                        u_ptr += 8;
-                        v_ptr += 8;
+                        recon_yoffset += 16;
+                        recon_uvoffset += 8;
+
+                        ++xd->mode_info_context;  /* next mb */
 
-                        mbd->mode_info_context++;     // step to next MB
+                        xd->above_context++;
 
+                        /*pbi->mb_row_di[ithread].current_mb_col = mb_col;*/
+                        pbi->mt_current_mb_col[mb_row] = mb_col;
                     }
 
-                    y_ptr += post->y_stride  * 16 - post->y_width;
-                    u_ptr += post->uv_stride *  8 - post->uv_width;
-                    v_ptr += post->uv_stride *  8 - post->uv_width;
+                    /* adjust to the next row of mbs */
+                    if (pbi->common.filter_level)
+                    {
+                        if(mb_row != pc->mb_rows-1)
+                        {
+                            int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
+                            int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+                            for (i = 0; i < 4; i++)
+                            {
+                                pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+                                pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+                                pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+                            }
+                        }
+                    } else
+                        vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
-                    mbd->mode_info_context++;         // Skip border mb
-                }
+                    ++xd->mode_info_context;      /* skip prediction column */
 
-                //printf("R%d\n", mb_row);
-                // When done, signal main thread that ME is finished
-                //SetEvent(pbi->h_event_lpf);
-                sem_post(&pbi->h_event_lpf);
+                    /* since we have multithread */
+                    xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+                }
             }
-
+        }
+        /*  add this to each frame */
+        if ((mbrd->mb_row == pbi->common.mb_rows-1) || ((mbrd->mb_row == pbi->common.mb_rows-2) && (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
+        {
+            /*SetEvent(pbi->h_event_end_decoding);*/
+            sem_post(&pbi->h_event_end_decoding);
         }
     }
-
 #else
     (void) p_data;
 #endif
-    return 0;
+
+    return 0 ;
 }
 
+
 void vp8_decoder_create_threads(VP8D_COMP *pbi)
 {
 #if CONFIG_MULTITHREAD
     int core_count = 0;
     int ithread;
+    int i;
 
     pbi->b_multithreaded_rd = 0;
-    pbi->b_multithreaded_lf = 0;
     pbi->allocated_decoding_thread_count = 0;
-    core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads; //vp8_get_proc_core_count();
-    if (core_count > 1)
-    {
-        sem_init(&pbi->h_event_lpf, 0, 0);
-        sem_init(&pbi->h_event_start_lpf, 0, 0);
-        pbi->b_multithreaded_lf = 1;
-        pthread_create(&pbi->h_thread_lpf, 0, vp8_thread_loop_filter, (pbi));
-    }
+    core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;
 
     if (core_count > 1)
     {
         pbi->b_multithreaded_rd = 1;
-        pbi->decoding_thread_count = core_count - 1;
+        pbi->decoding_thread_count = core_count -1;
 
         CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->h_event_mbrdecoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
+        CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
         CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
         vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
         CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
 
         for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
         {
-            sem_init(&pbi->h_event_mbrdecoding[ithread], 0, 0);
+            sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
 
             pbi->de_thread_data[ithread].ithread  = ithread;
             pbi->de_thread_data[ithread].ptr1     = (void *)pbi;
             pbi->de_thread_data[ithread].ptr2     = (void *) &pbi->mb_row_di[ithread];
 
             pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
-
         }
 
-        sem_init(&pbi->h_event_main, 0, 0);
+        sem_init(&pbi->h_event_end_decoding, 0, 0);
+
         pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
     }
 
@@ -440,45 +489,196 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
 #endif
 }
 
-void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 {
 #if CONFIG_MULTITHREAD
+    VP8_COMMON *const pc = & pbi->common;
+    int i;
 
-    if (pbi->b_multithreaded_lf)
-    {
-        pbi->b_multithreaded_lf = 0;
-        sem_post(&pbi->h_event_start_lpf);
-        pthread_join(pbi->h_thread_lpf, 0);
-        sem_destroy(&pbi->h_event_start_lpf);
-    }
-
-    //shutdown MB Decoding thread;
     if (pbi->b_multithreaded_rd)
     {
-        pbi->b_multithreaded_rd = 0;
-        // allow all threads to exit
+        if (pbi->mt_current_mb_col)
         {
-            int i;
+            vpx_free(pbi->mt_current_mb_col);
+            pbi->mt_current_mb_col = NULL ;
+        }
 
-            for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+        /* Free above_row buffers. */
+        if (pbi->mt_yabove_row)
+        {
+            for (i=0; i< mb_rows; i++)
             {
+                if (pbi->mt_yabove_row[i])
+                {
+                    vpx_free(pbi->mt_yabove_row[i]);
+                    pbi->mt_yabove_row[i] = NULL ;
+                }
+            }
+            vpx_free(pbi->mt_yabove_row);
+            pbi->mt_yabove_row = NULL ;
+        }
 
-                sem_post(&pbi->h_event_mbrdecoding[i]);
-                pthread_join(pbi->h_decoding_thread[i], NULL);
+        if (pbi->mt_uabove_row)
+        {
+            for (i=0; i< mb_rows; i++)
+            {
+                if (pbi->mt_uabove_row[i])
+                {
+                    vpx_free(pbi->mt_uabove_row[i]);
+                    pbi->mt_uabove_row[i] = NULL ;
+                }
             }
+            vpx_free(pbi->mt_uabove_row);
+            pbi->mt_uabove_row = NULL ;
         }
+
+        if (pbi->mt_vabove_row)
         {
+            for (i=0; i< mb_rows; i++)
+            {
+                if (pbi->mt_vabove_row[i])
+                {
+                    vpx_free(pbi->mt_vabove_row[i]);
+                    pbi->mt_vabove_row[i] = NULL ;
+                }
+            }
+            vpx_free(pbi->mt_vabove_row);
+            pbi->mt_vabove_row = NULL ;
+        }
 
-            int i;
-            for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+        /* Free left_col buffers. */
+        if (pbi->mt_yleft_col)
+        {
+            for (i=0; i< mb_rows; i++)
+            {
+                if (pbi->mt_yleft_col[i])
+                {
+                    vpx_free(pbi->mt_yleft_col[i]);
+                    pbi->mt_yleft_col[i] = NULL ;
+                }
+            }
+            vpx_free(pbi->mt_yleft_col);
+            pbi->mt_yleft_col = NULL ;
+        }
+
+        if (pbi->mt_uleft_col)
+        {
+            for (i=0; i< mb_rows; i++)
             {
-                sem_destroy(&pbi->h_event_mbrdecoding[i]);
+                if (pbi->mt_uleft_col[i])
+                {
+                    vpx_free(pbi->mt_uleft_col[i]);
+                    pbi->mt_uleft_col[i] = NULL ;
+                }
             }
+            vpx_free(pbi->mt_uleft_col);
+            pbi->mt_uleft_col = NULL ;
+        }
+
+        if (pbi->mt_vleft_col)
+        {
+            for (i=0; i< mb_rows; i++)
+            {
+                if (pbi->mt_vleft_col[i])
+                {
+                    vpx_free(pbi->mt_vleft_col[i]);
+                    pbi->mt_vleft_col[i] = NULL ;
+                }
+            }
+            vpx_free(pbi->mt_vleft_col);
+            pbi->mt_vleft_col = NULL ;
+        }
+    }
+#else
+    (void) pbi;
+#endif
+}
+
 
+int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+{
+#if CONFIG_MULTITHREAD
+    VP8_COMMON *const pc = & pbi->common;
+    int i;
+    int uv_width;
+
+    if (pbi->b_multithreaded_rd)
+    {
+        vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
+
+        /* our internal buffers are always multiples of 16 */
+        if ((width & 0xf) != 0)
+            width += 16 - (width & 0xf);
+
+        if (width < 640) pbi->sync_range = 1;
+        else if (width <= 1280) pbi->sync_range = 8;
+        else if (width <= 2560) pbi->sync_range =16;
+        else pbi->sync_range = 32;
+
+        uv_width = width >>1;
+
+        /* Allocate an int for each mb row. */
+        CHECK_MEM_ERROR(pbi->mt_current_mb_col, vpx_malloc(sizeof(int) * pc->mb_rows));
+
+        /* Allocate memory for above_row buffers. */
+        CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+        for (i=0; i< pc->mb_rows; i++)
+            CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_calloc(sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)), 1));
+
+        CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+        for (i=0; i< pc->mb_rows; i++)
+            CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
+
+        CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+        for (i=0; i< pc->mb_rows; i++)
+            CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
+
+        /* Allocate memory for left_col buffers. */
+        CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+        for (i=0; i< pc->mb_rows; i++)
+            CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
+
+        CHECK_MEM_ERROR(pbi->mt_uleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+        for (i=0; i< pc->mb_rows; i++)
+            CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+
+        CHECK_MEM_ERROR(pbi->mt_vleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+        for (i=0; i< pc->mb_rows; i++)
+            CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+    }
+    return 0;
+#else
+    (void) pbi;
+    (void) width;
+#endif
+}
+
+
+void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+{
+#if CONFIG_MULTITHREAD
 
+    /* shutdown MB Decoding thread; */
+    if (pbi->b_multithreaded_rd)
+    {
+        int i;
+
+        pbi->b_multithreaded_rd = 0;
+
+        /* allow all threads to exit */
+        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+        {
+            sem_post(&pbi->h_event_start_decoding[i]);
+            pthread_join(pbi->h_decoding_thread[i], NULL);
+        }
+
+        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+        {
+            sem_destroy(&pbi->h_event_start_decoding[i]);
         }
 
-        sem_destroy(&pbi->h_event_main);
+        sem_destroy(&pbi->h_event_end_decoding);
 
         if (pbi->h_decoding_thread)
         {
@@ -486,10 +686,10 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
             pbi->h_decoding_thread = NULL;
         }
 
-        if (pbi->h_event_mbrdecoding)
+        if (pbi->h_event_start_decoding)
         {
-            vpx_free(pbi->h_event_mbrdecoding);
-            pbi->h_event_mbrdecoding = NULL;
+            vpx_free(pbi->h_event_start_decoding);
+            pbi->h_event_start_decoding = NULL;
         }
 
         if (pbi->mb_row_di)
@@ -504,43 +704,65 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
             pbi->de_thread_data = NULL;
         }
     }
-
 #else
     (void) pbi;
 #endif
 }
 
 
-void vp8_start_lfthread(VP8D_COMP *pbi)
+void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
 {
 #if CONFIG_MULTITHREAD
-    memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
-    pbi->last_mb_row_decoded = 0;
-    sem_post(&pbi->h_event_start_lpf);
-#else
-    (void) pbi;
-#endif
-}
-
-void vp8_stop_lfthread(VP8D_COMP *pbi)
-{
-#if CONFIG_MULTITHREAD
-    struct vpx_usec_timer timer;
-
-    vpx_usec_timer_start(&timer);
-
-    sem_wait(&pbi->h_event_lpf);
+    VP8_COMMON *cm  = &pbi->common;
+    MACROBLOCKD *mbd = &pbi->mb;
+    /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/
+    loop_filter_info *lfi = cm->lf_info;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    /*int mb_row;
+    int mb_col;
+    int baseline_filter_level[MAX_MB_SEGMENTS];*/
+    int filter_level;
+    int alt_flt_enabled = mbd->segmentation_enabled;
+
+    int i;
+    /*unsigned char *y_ptr, *u_ptr, *v_ptr;*/
+
+    /* Note the baseline filter values for each segment */
+    if (alt_flt_enabled)
+    {
+        for (i = 0; i < MAX_MB_SEGMENTS; i++)
+        {
+            /* Abs value */
+            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+                pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+            /* Delta Value */
+            else
+            {
+                pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+                pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
+            }
+        }
+    }
+    else
+    {
+        for (i = 0; i < MAX_MB_SEGMENTS; i++)
+            pbi->mt_baseline_filter_level[i] = default_filt_lvl;
+    }
 
-    vpx_usec_timer_mark(&timer);
-    pbi->time_loop_filtering += vpx_usec_timer_elapsed(&timer);
+    /* Initialize the loop filter for this frame. */
+    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+        vp8_init_loop_filter(cm);
+    else if (frame_type != cm->last_frame_type)
+        vp8_frame_init_loop_filter(lfi, frame_type);
 #else
     (void) pbi;
+    (void) default_filt_lvl;
 #endif
 }
 
 
-void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
-                          MACROBLOCKD *xd)
+void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
 #if CONFIG_MULTITHREAD
     int mb_row;
@@ -548,47 +770,212 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
 
     int ibc = 0;
     int num_part = 1 << pbi->common.multi_token_partition;
+    int i, j;
+    volatile int *last_row_current_mb_col = NULL;
+    int nsync = pbi->sync_range;
+
+    int filter_level;
+    loop_filter_info *lfi = pc->lf_info;
+    int alt_flt_enabled = xd->segmentation_enabled;
+    int Segment;
+
+    if(pbi->common.filter_level)
+    {
+        /* Set above_row buffer to 127 for decoding first MB row */
+        vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, pc->yv12_fb[pc->lst_fb_idx].y_width + 5);
+        vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (pc->yv12_fb[pc->lst_fb_idx].y_width>>1) +5);
+        vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (pc->yv12_fb[pc->lst_fb_idx].y_width>>1) +5);
+
+        for (i=1; i<pc->mb_rows; i++)
+        {
+            vpx_memset(pbi->mt_yabove_row[i] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+            vpx_memset(pbi->mt_uabove_row[i] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+            vpx_memset(pbi->mt_vabove_row[i] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+        }
+
+        /* Set left_col to 129 initially */
+        for (i=0; i<pc->mb_rows; i++)
+        {
+            vpx_memset(pbi->mt_yleft_col[i], (unsigned char)129, 16);
+            vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
+            vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
+        }
+        vp8mt_lpf_init(pbi, pc->filter_level);
+    }
 
     vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
 
+    for (i = 0; i < pbi->decoding_thread_count; i++)
+        sem_post(&pbi->h_event_start_decoding[i]);
+
     for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
     {
         int i;
-        pbi->current_mb_col_main = -1;
-
-        xd->current_bc = &pbi->mbc[ibc];
-        ibc++ ;
 
-        if (ibc == num_part)
-            ibc = 0;
+        xd->current_bc = &pbi->mbc[mb_row%num_part];
 
-        for (i = 0; i < pbi->decoding_thread_count; i++)
+        /* vp8_decode_mb_row(pbi, pc, mb_row, xd); */
         {
-            if ((mb_row + i + 1) >= pc->mb_rows)
-                break;
+            int i;
+            int recon_yoffset, recon_uvoffset;
+            int mb_col;
+            int ref_fb_idx = pc->lst_fb_idx;
+            int dst_fb_idx = pc->new_fb_idx;
+            int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+            int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
 
-            pbi->mb_row_di[i].mb_row = mb_row + i + 1;
-            pbi->mb_row_di[i].mbd.current_bc =  &pbi->mbc[ibc];
-            ibc++;
+           /* volatile int *last_row_current_mb_col = NULL; */
+            if (mb_row > 0)
+                last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
 
-            if (ibc == num_part)
-                ibc = 0;
+            vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
+            recon_yoffset = mb_row * recon_y_stride * 16;
+            recon_uvoffset = mb_row * recon_uv_stride * 8;
+            /* reset above block coeffs */
 
-            pbi->mb_row_di[i].current_mb_col = -1;
-            sem_post(&pbi->h_event_mbrdecoding[i]);
-        }
+            xd->above_context = pc->above_context;
+            xd->up_available = (mb_row != 0);
 
-        vp8_decode_mb_row(pbi, pc, mb_row, xd);
+            xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
 
-        xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+            for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+            {
+                if ( mb_row > 0 && (mb_col & (nsync-1)) == 0){
+                    while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
+                    {
+                        x86_pause_hint();
+                        thread_sleep(0);
+                    }
+                }
 
-        if (mb_row < pc->mb_rows - 1)
-        {
-            sem_wait(&pbi->h_event_main);
+                if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
+                {
+                    for (i = 0; i < 16; i++)
+                    {
+                        BLOCKD *d = &xd->block[i];
+                        vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
+                    }
+                }
+
+                if(pbi->common.filter_level)
+                {
+                    /* update loopfilter info */
+                    Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                    filter_level = pbi->mt_baseline_filter_level[Segment];
+                    /* Distance of Mb to the various image edges.
+                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                     * Apply any context driven MB level adjustment
+                     */
+                    vp8_adjust_mb_lf_value(xd, &filter_level);
+                }
+
+                /* Distance of Mb to the various image edges.
+                 * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                 */
+                xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+                xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+                xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+                xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+                xd->left_available = (mb_col != 0);
+
+                /* Select the appropriate reference frame for this MB */
+                if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+                    ref_fb_idx = pc->lst_fb_idx;
+                else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+                    ref_fb_idx = pc->gld_fb_idx;
+                else
+                    ref_fb_idx = pc->alt_fb_idx;
+
+                xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+                xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+                xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+                vp8_build_uvmvs(xd, pc->full_pixel);
+                vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
+
+                if (pbi->common.filter_level)
+                {
+                    /* Save decoded MB last row data for next-row decoding */
+                    if(mb_row != pc->mb_rows-1)
+                    {
+                        vpx_memcpy((pbi->mt_yabove_row[mb_row +1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+                        vpx_memcpy((pbi->mt_uabove_row[mb_row +1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+                        vpx_memcpy((pbi->mt_vabove_row[mb_row +1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+                    }
+
+                    /* save left_col for next MB decoding */
+                    if(mb_col != pc->mb_cols-1)
+                    {
+                        MODE_INFO *next = xd->mode_info_context +1;
+
+                        if (xd->frame_type == KEY_FRAME  ||  next->mbmi.ref_frame == INTRA_FRAME)
+                        {
+                            for (i = 0; i < 16; i++)
+                                pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+                            for (i = 0; i < 8; i++)
+                            {
+                                pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+                                pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+                            }
+                        }
+                    }
+
+                    /* loopfilter on this macroblock. */
+                    if (filter_level)
+                    {
+                        if (mb_col > 0)
+                            pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+                        if (xd->mode_info_context->mbmi.dc_diff > 0)
+                            pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+                        /* don't apply across umv border */
+                        if (mb_row > 0)
+                            pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+                        if (xd->mode_info_context->mbmi.dc_diff > 0)
+                            pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                    }
+                }
+
+                recon_yoffset += 16;
+                recon_uvoffset += 8;
+
+                ++xd->mode_info_context;  /* next mb */
+
+                xd->above_context++;
+
+                pbi->mt_current_mb_col[mb_row] = mb_col;
+            }
+
+            /* adjust to the next row of mbs */
+            if (pbi->common.filter_level)
+            {
+                if(mb_row != pc->mb_rows-1)
+                {
+                    int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
+                    int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+                    for (i = 0; i < 4; i++)
+                    {
+                        pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+                        pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+                        pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+                    }
+                }
+            }else
+                vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+            ++xd->mode_info_context;      /* skip prediction column */
         }
+        xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
     }
 
-    pbi->last_mb_row_decoded = mb_row;
+    sem_wait(&pbi->h_event_end_decoding);   /* add back for each frame */
 #else
     (void) pbi;
     (void) xd;
diff --git a/vp8/decoder/treereader.h b/vp8/decoder/treereader.h
index eb10e2460..277842896 100644
--- a/vp8/decoder/treereader.h
+++ b/vp8/decoder/treereader.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 02be4872e..0d6133a46 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -49,12 +50,12 @@ sym(vp8_dequantize_b_impl_mmx):
     ret
 
 
-;void dequant_idct_mmx(short *input, short *dq, short *output, int pitch)
-global sym(vp8_dequant_idct_mmx)
-sym(vp8_dequant_idct_mmx):
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+global sym(vp8_dequant_idct_add_mmx)
+sym(vp8_dequant_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 6
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -76,7 +77,8 @@ sym(vp8_dequant_idct_mmx):
         movq        mm3,    [rax+24]
         pmullw      mm3,    [rdx+24]
 
-        mov         rdx,    arg(2) ;output
+        mov         rdx,    arg(3) ;dest
+        mov         rsi,    arg(2) ;pred
         pxor        mm7,    mm7
 
 
@@ -87,7 +89,8 @@ sym(vp8_dequant_idct_mmx):
         movq        [rax+24],mm7
 
 
-        movsxd      rax,            dword ptr arg(3) ;pitch
+        movsxd      rax,            dword ptr arg(4) ;pitch
+        movsxd      rdi,            dword ptr arg(5) ;stride
 
         psubw       mm0,            mm2             ; b1= 0-2
         paddw       mm2,            mm2             ;
@@ -95,11 +98,11 @@ sym(vp8_dequant_idct_mmx):
         movq        mm5,            mm1
         paddw       mm2,            mm0             ; a1 =0+2
 
-        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
 
         movq        mm7,            mm3             ;
-        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
 
         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
         psubw       mm7,            mm5             ; c1
@@ -107,10 +110,10 @@ sym(vp8_dequant_idct_mmx):
         movq        mm5,            mm1
         movq        mm4,            mm3
 
-        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
         paddw       mm5,            mm1
 
-        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
         paddw       mm3,            mm4
 
         paddw       mm3,            mm5             ; d1
@@ -150,11 +153,11 @@ sym(vp8_dequant_idct_mmx):
         movq        mm5,            mm1
         paddw       mm2,            mm0             ; a1 =0+2
 
-        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
 
         movq        mm7,            mm3             ;
-        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
 
         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
         psubw       mm7,            mm5             ; c1
@@ -162,16 +165,16 @@ sym(vp8_dequant_idct_mmx):
         movq        mm5,            mm1
         movq        mm4,            mm3
 
-        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
         paddw       mm5,            mm1
 
-        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
         paddw       mm3,            mm4
 
         paddw       mm3,            mm5             ; d1
-        paddw       mm0,            [fours GLOBAL]
+        paddw       mm0,            [GLOBAL(fours)]
 
-        paddw       mm2,            [fours GLOBAL]
+        paddw       mm2,            [GLOBAL(fours)]
         movq        mm6,            mm2             ; a1
 
         movq        mm4,            mm0             ; b1
@@ -206,13 +209,34 @@ sym(vp8_dequant_idct_mmx):
         punpckldq   mm2,            mm4             ; 32 22 12 02
         punpckhdq   mm5,            mm4             ; 33 23 13 03
 
-        movq        [rdx],          mm0
+        pxor        mm7,            mm7
 
-        movq        [rdx+rax],      mm1
-        movq        [rdx+rax*2],    mm2
+        movd        mm4,            [rsi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm0,            mm4
+        packuswb    mm0,            mm7
+        movd        [rdx],          mm0
 
-        add         rdx,            rax
-        movq        [rdx+rax*2],    mm5
+        movd        mm4,            [rsi+rax]
+        punpcklbw   mm4,            mm7
+        paddsw      mm1,            mm4
+        packuswb    mm1,            mm7
+        movd        [rdx+rdi],      mm1
+
+        movd        mm4,            [rsi+2*rax]
+        punpcklbw   mm4,            mm7
+        paddsw      mm2,            mm4
+        packuswb    mm2,            mm7
+        movd        [rdx+rdi*2],    mm2
+
+        add         rdx,            rdi
+        add         rsi,            rax
+
+        movd        mm4,            [rsi+2*rax]
+        punpcklbw   mm4,            mm7
+        paddsw      mm5,            mm4
+        packuswb    mm5,            mm7
+        movd        [rdx+rdi*2],    mm5
 
     ; begin epilog
     pop rdi
@@ -223,12 +247,12 @@ sym(vp8_dequant_idct_mmx):
     ret
 
 
-;void dequant_dc_idct_mmx(short *input, short *dq, short *output, int pitch, int Dc)
-global sym(vp8_dequant_dc_idct_mmx)
-sym(vp8_dequant_dc_idct_mmx):
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+global sym(vp8_dequant_dc_idct_add_mmx)
+sym(vp8_dequant_dc_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
+    SHADOW_ARGS_TO_STACK 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -237,8 +261,6 @@ sym(vp8_dequant_dc_idct_mmx):
         mov         rax,    arg(0) ;input
         mov         rdx,    arg(1) ;dq
 
-        movsxd      rcx,    dword ptr arg(4) ;Dc
-
         movq        mm0,    [rax   ]
         pmullw      mm0,    [rdx]
 
@@ -251,7 +273,8 @@ sym(vp8_dequant_dc_idct_mmx):
         movq        mm3,    [rax+24]
         pmullw      mm3,    [rdx+24]
 
-        mov         rdx,    arg(2) ;output
+        mov         rdx,    arg(3) ;dest
+        mov         rsi,    arg(2) ;pred
         pxor        mm7,    mm7
 
 
@@ -261,8 +284,15 @@ sym(vp8_dequant_dc_idct_mmx):
         movq        [rax+16],mm7
         movq        [rax+24],mm7
 
-        pinsrw      mm0,    rcx,  0
-        movsxd      rax,            dword ptr arg(3) ;pitch
+        ; move lower word of Dc to lower word of mm0
+        psrlq       mm0,    16
+        movzx       rcx,    word ptr arg(6) ;Dc
+        psllq       mm0,    16
+        movq        mm7,    rcx
+        por         mm0,    mm7
+
+        movsxd      rax,            dword ptr arg(4) ;pitch
+        movsxd      rdi,            dword ptr arg(5) ;stride
 
         psubw       mm0,            mm2             ; b1= 0-2
         paddw       mm2,            mm2             ;
@@ -270,11 +300,11 @@ sym(vp8_dequant_dc_idct_mmx):
         movq        mm5,            mm1
         paddw       mm2,            mm0             ; a1 =0+2
 
-        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
 
         movq        mm7,            mm3             ;
-        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
 
         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
         psubw       mm7,            mm5             ; c1
@@ -282,10 +312,10 @@ sym(vp8_dequant_dc_idct_mmx):
         movq        mm5,            mm1
         movq        mm4,            mm3
 
-        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
         paddw       mm5,            mm1
 
-        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
         paddw       mm3,            mm4
 
         paddw       mm3,            mm5             ; d1
@@ -325,11 +355,11 @@ sym(vp8_dequant_dc_idct_mmx):
         movq        mm5,            mm1
         paddw       mm2,            mm0             ; a1 =0+2
 
-        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
 
         movq        mm7,            mm3             ;
-        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
 
         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
         psubw       mm7,            mm5             ; c1
@@ -337,16 +367,16 @@ sym(vp8_dequant_dc_idct_mmx):
         movq        mm5,            mm1
         movq        mm4,            mm3
 
-        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
         paddw       mm5,            mm1
 
-        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
         paddw       mm3,            mm4
 
         paddw       mm3,            mm5             ; d1
-        paddw       mm0,            [fours GLOBAL]
+        paddw       mm0,            [GLOBAL(fours)]
 
-        paddw       mm2,            [fours GLOBAL]
+        paddw       mm2,            [GLOBAL(fours)]
         movq        mm6,            mm2             ; a1
 
         movq        mm4,            mm0             ; b1
@@ -381,13 +411,34 @@ sym(vp8_dequant_dc_idct_mmx):
         punpckldq   mm2,            mm4             ; 32 22 12 02
         punpckhdq   mm5,            mm4             ; 33 23 13 03
 
-        movq        [rdx],          mm0
-
-        movq        [rdx+rax],      mm1
-        movq        [rdx+rax*2],    mm2
-
-        add         rdx,            rax
-        movq        [rdx+rax*2],    mm5
+        pxor        mm7,            mm7
+
+        movd        mm4,            [rsi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm0,            mm4
+        packuswb    mm0,            mm7
+        movd        [rdx],          mm0
+
+        movd        mm4,            [rsi+rax]
+        punpcklbw   mm4,            mm7
+        paddsw      mm1,            mm4
+        packuswb    mm1,            mm7
+        movd        [rdx+rdi],      mm1
+
+        movd        mm4,            [rsi+2*rax]
+        punpcklbw   mm4,            mm7
+        paddsw      mm2,            mm4
+        packuswb    mm2,            mm7
+        movd        [rdx+rdi*2],    mm2
+
+        add         rdx,            rdi
+        add         rsi,            rax
+
+        movd        mm4,            [rsi+2*rax]
+        punpcklbw   mm4,            mm7
+        paddsw      mm5,            mm4
+        packuswb    mm5,            mm7
+        movd        [rdx+rdi*2],    mm5
 
     ; begin epilog
     pop rdi
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h
index 5def406d3..dc68daab3 100644
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -20,19 +21,48 @@
  */
 #if HAVE_MMX
 extern prototype_dequant_block(vp8_dequantize_b_mmx);
-extern prototype_dequant_idct(vp8_dequant_idct_mmx);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_mmx);
-
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
+extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_mmx
 
-#undef  vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_mmx
+#undef  vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
+
+#undef  vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
+
+#undef vp8_dequant_dc_idct_add_y_block
+#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
+
+#undef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
+
+#undef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx
+
+#endif
+#endif
+
+#if HAVE_SSE2
+extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_dequant_dc_idct_add_y_block
+#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
+
+#undef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
 
-#undef  vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_mmx
+#undef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
 
 #endif
 #endif
diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c
new file mode 100644
index 000000000..78c91d3d2
--- /dev/null
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "dequantize.h"
+
+void vp8_dequant_dc_idct_add_y_block_mmx
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, short *dc)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_dc_idct_add_mmx (q, dq, pre, dst, 16, stride, dc[0]);
+        else
+            vp8_dc_only_idct_add_mmx (dc[0], pre, dst, 16, stride);
+
+        if (eobs[1] > 1)
+            vp8_dequant_dc_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
+        else
+            vp8_dc_only_idct_add_mmx (dc[1], pre+4, dst+4, 16, stride);
+
+        if (eobs[2] > 1)
+            vp8_dequant_dc_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
+        else
+            vp8_dc_only_idct_add_mmx (dc[2], pre+8, dst+8, 16, stride);
+
+        if (eobs[3] > 1)
+            vp8_dequant_dc_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
+        else
+            vp8_dc_only_idct_add_mmx (dc[3], pre+12, dst+12, 16, stride);
+
+        q    += 64;
+        dc   += 4;
+        pre  += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_y_block_mmx
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_mmx (q, dq, pre, dst, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dst, 16, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dst+4, 16, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        if (eobs[2] > 1)
+            vp8_dequant_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[32]*dq[0], pre+8, dst+8, 16, stride);
+            ((int *)(q+32))[0] = 0;
+        }
+
+        if (eobs[3] > 1)
+            vp8_dequant_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[48]*dq[0], pre+12, dst+12, 16, stride);
+            ((int *)(q+48))[0] = 0;
+        }
+
+        q    += 64;
+        pre  += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_mmx
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_mmx (q, dq, pre, dstu, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstu, 8, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstu+4, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstu+4, 8, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        q    += 32;
+        pre  += 32;
+        dstu += 4*stride;
+        eobs += 2;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_mmx (q, dq, pre, dstv, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstv, 8, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstv+4, 8, stride);
+        else
+        {
+            vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstv+4, 8, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        q    += 32;
+        pre  += 32;
+        dstv += 4*stride;
+        eobs += 2;
+    }
+}
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
new file mode 100644
index 000000000..0273d6ed2
--- /dev/null
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "dequantize.h"
+
+void idct_dequant_dc_0_2x_sse2
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int dst_stride, short *dc);
+void idct_dequant_dc_full_2x_sse2
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int dst_stride, short *dc);
+
+void idct_dequant_0_2x_sse2
+            (short *q, short *dq ,unsigned char *pre,
+             unsigned char *dst, int dst_stride, int blk_stride);
+void idct_dequant_full_2x_sse2
+            (short *q, short *dq ,unsigned char *pre,
+             unsigned char *dst, int dst_stride, int blk_stride);
+
+void vp8_dequant_dc_idct_add_y_block_sse2
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, short *dc)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)(eobs))[0] & 0xfefe)
+            idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
+        else
+            idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
+
+        if (((short *)(eobs))[1] & 0xfefe)
+            idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
+        else
+            idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
+
+        q    += 64;
+        dc   += 4;
+        pre  += 64;
+        dst  += stride*4;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_y_block_sse2
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)(eobs))[0] & 0xfefe)
+            idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
+        else
+            idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
+
+        if (((short *)(eobs))[1] & 0xfefe)
+            idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
+        else
+            idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
+
+        q    += 64;
+        pre  += 64;
+        dst  += stride*4;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_sse2
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    if (((short *)(eobs))[0] & 0xfefe)
+        idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
+    else
+        idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
+
+    q    += 32;
+    pre  += 32;
+    dstu += stride*4;
+
+    if (((short *)(eobs))[1] & 0xfefe)
+        idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
+    else
+        idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
+
+    q    += 32;
+    pre  += 32;
+
+    if (((short *)(eobs))[2] & 0xfefe)
+        idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
+    else
+        idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+
+    q    += 32;
+    pre  += 32;
+    dstv += stride*4;
+
+    if (((short *)(eobs))[3] & 0xfefe)
+        idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
+    else
+        idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+}
diff --git a/vp8/decoder/x86/onyxdxv.c b/vp8/decoder/x86/onyxdxv.c
index 75a676a07..50293c792 100644
--- a/vp8/decoder/x86/onyxdxv.c
+++ b/vp8/decoder/x86/onyxdxv.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index 6d7cc3666..47e346dd9 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -38,14 +39,24 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
 #if CONFIG_RUNTIME_CPU_DETECT
     /* Override default functions with fastest ones for this CPU. */
 #if HAVE_MMX
-
     if (flags & HAS_MMX)
     {
-        pbi->dequant.block   = vp8_dequantize_b_mmx;
-        pbi->dequant.idct    = vp8_dequant_idct_mmx;
-        pbi->dequant.idct_dc = vp8_dequant_dc_idct_mmx;
+        pbi->dequant.block               = vp8_dequantize_b_mmx;
+        pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;
+        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;
+        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
+        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
+        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
+    }
+#endif
+#if HAVE_SSE2
+    if (flags & HAS_SSE2)
+    {
+        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
+        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
+        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
     }
-
 #endif
+
 #endif
 }
diff --git a/vp8/decoder/xprintf.c b/vp8/decoder/xprintf.c
deleted file mode 100644
index cb2221c15..000000000
--- a/vp8/decoder/xprintf.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     xprintf.cpp
-*
-*   Description  :     Display a printf style message on the current video frame.
-*
-****************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-
-#include <stdio.h>
-#include <stdarg.h>
-#ifdef _WIN32_WCE
-#include <windows.h>
-#endif
-#include "xprintf.h"
-
-/****************************************************************************
- *
- *  ROUTINE       : xprintf
- *
- *  INPUTS        : const PB_INSTANCE *ppbi : Pointer to decoder instance.
- *                  long n_pixel             : Offset into buffer to write text.
- *                  const char *format      : Format string for print.
- *                  ...                     : Variable length argument list.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : int: Size (in bytes) of the formatted text.
- *
- *  FUNCTION      : Display a printf style message on the current video frame.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-int onyx_xprintf(unsigned char *ppbuffer, long n_pixel, long n_size, long n_stride, const char *format, ...)
-{
-    BOOL b_rc;
-    va_list arglist;
-    HFONT hfont, hfonto;
-
-    int rc = 0;
-    char sz_formatted[256] = "";
-    unsigned char *p_dest = &ppbuffer[n_pixel];
-
-#ifdef _WIN32_WCE
-    //  Set up temporary bitmap
-    HDC hdc_memory   = NULL;
-    HBITMAP hbm_temp = NULL;
-    HBITMAP hbm_orig = NULL;
-
-    RECT rect;
-
-    //  Copy bitmap to video frame
-    long x;
-    long y;
-
-    //  Format text
-    va_start(arglist, format);
-    _vsnprintf(sz_formatted, sizeof(sz_formatted), format, arglist);
-    va_end(arglist);
-
-    rect.left   = 0;
-    rect.top    = 0;
-    rect.right  = 8 * strlen(sz_formatted);
-    rect.bottom = 8;
-
-    hdc_memory = create_compatible_dc(NULL);
-
-    if (hdc_memory == NULL)
-        goto Exit;
-
-    hbm_temp = create_bitmap(rect.right, rect.bottom, 1, 1, NULL);
-
-    if (hbm_temp == NULL)
-        goto Exit;
-
-    hbm_orig = (HBITMAP)(select_object(hdc_memory, hbm_temp));
-
-    if (!hbm_orig)
-        goto Exit;
-
-    //  Write text into bitmap
-    //  font?
-    hfont = create_font(8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, VARIABLE_PITCH | FF_SWISS, "");
-
-    if (hfont == NULL)
-        goto Exit;
-
-    hfonto = (HFONT)(select_object(hdc_memory, hbm_temp));
-
-    if (!hfonto)
-        goto Exit;
-
-    select_object(hdc_memory, hfont);
-    set_text_color(hdc_memory, 1);
-    set_bk_color(hdc_memory, 0);
-    set_bk_mode(hdc_memory, TRANSPARENT);
-
-    b_rc = bit_blt(hdc_memory, rect.left, rect.top, rect.right, rect.bottom, hdc_memory, rect.left, rect.top, BLACKNESS);
-
-    if (!b_rc)
-        goto Exit;
-
-    b_rc = ext_text_out(hdc_memory, 0, 0, ETO_CLIPPED, &rect, sz_formatted, strlen(sz_formatted), NULL);
-
-    if (!b_rc)
-        goto Exit;
-
-    for (y = rect.top; y < rect.bottom; ++y)
-    {
-        for (x = rect.left; x < rect.right; ++x)
-        {
-            if (get_pixel(hdc_memory, x, rect.bottom - 1 - y))
-                p_dest[x] = 255;
-        }
-
-        p_dest += n_stride;
-    }
-
-    rc = strlen(sz_formatted);
-
-Exit:
-
-    if (hbm_temp != NULL)
-    {
-        if (hbm_orig != NULL)
-        {
-            select_object(hdc_memory, hbm_orig);
-        }
-
-        delete_object(hbm_temp);
-    }
-
-    if (hfont != NULL)
-    {
-        if (hfonto != NULL)
-            select_object(hdc_memory, hfonto);
-
-        delete_object(hfont);
-    }
-
-    if (hdc_memory != NULL)
-        delete_dc(hdc_memory);
-
-    hdc_memory = 0;
-
-#endif
-
-    return rc;
-}
diff --git a/vp8/decoder/xprintf.h b/vp8/decoder/xprintf.h
deleted file mode 100644
index 2f175e943..000000000
--- a/vp8/decoder/xprintf.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     xprintf.h
-*
-*   Description  :     Debug print interface header file.
-*
-****************************************************************************/
-#ifndef __INC_XPRINTF_H
-#define __INC_XPRINTF_H
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-
-/****************************************************************************
-*  Functions
-****************************************************************************/
-
-// Display a printf style message on the current video frame
-extern int onyx_xprintf(unsigned char *ppbuffer, long n_pixel, long n_size, long n_stride, const char *format, ...);
-
-#endif
author	Suman Sunkara <sunkaras@google.com>	2010-11-16 15:09:26 -0500
committer	Suman Sunkara <sunkaras@google.com>	2010-11-16 16:30:59 -0500
commit	4b3f72001de952aaf3874d3ce2dca3cb3fbe3928 (patch)
tree	754d87a4fa5cae908388f0a26b2affad7cd8a523 /vp8/decoder
parent	b9a18344cf8e5283928525c5ac0897ede79f9e57 (diff)
parent	00fe7441e9c5cbd898a8382d49cf9396d8482464 (diff)
download	libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.tar libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.tar.gz libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.tar.bz2 libvpx-4b3f72001de952aaf3874d3ce2dca3cb3fbe3928.zip