Merge remote branch 'internal/upstream' into HEAD

author: John Koleszar <jkoleszar@google.com> 2011-03-15 00:05:07 -0400
committer: John Koleszar <jkoleszar@google.com> 2011-03-15 00:05:07 -0400
commit: b210797a6ada5816a56cacb0b4106a5bf525314c (patch)
tree: f6a19e82285b78c105f7a7a05338a1e07eb65966 /vp8
parent: 23c4cbdfbae53661877d449072241651b788d016 (diff)
parent: 8c48c943e7c082b4330061e74ce85547253de301 (diff)
download: libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.tar
libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.tar.gz
libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.tar.bz2
libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.zip
5 files changed, 159 insertions, 27 deletions
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 7fc901054..314a8d7fd 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -296,18 +296,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                             }
                         }
 
-                        if(pbi->common.filter_level)
-                        {
-                            /*update loopfilter info*/
-                            Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
-                            filter_level = pbi->mt_baseline_filter_level[Segment];
-                            /* Distance of Mb to the various image edges.
-                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                             * Apply any context driven MB level adjustment
-                             */
-                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
-                        }
-
                         /* Distance of Mb to the various image edges.
                          * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                          */
@@ -362,7 +350,16 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                                 }
                             }
 
-                          /* loopfilter on this macroblock. */
+                            /* update loopfilter info */
+                            Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                            filter_level = pbi->mt_baseline_filter_level[Segment];
+                            /* Distance of Mb to the various image edges.
+                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                             * Apply any context driven MB level adjustment
+                             */
+                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
+
+                            /* loopfilter on this macroblock. */
                             if (filter_level)
                             {
                                 if (mb_col > 0)
@@ -778,18 +775,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                     }
                 }
 
-                if(pbi->common.filter_level)
-                {
-                    /* update loopfilter info */
-                    Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
-                    filter_level = pbi->mt_baseline_filter_level[Segment];
-                    /* Distance of Mb to the various image edges.
-                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                     * Apply any context driven MB level adjustment
-                     */
-                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
-                }
-
                 /* Distance of Mb to the various image edges.
                  * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                  */
@@ -853,6 +838,15 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                         }
                     }
 
+                    /* update loopfilter info */
+                    Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                    filter_level = pbi->mt_baseline_filter_level[Segment];
+                    /* Distance of Mb to the various image edges.
+                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                     * Apply any context driven MB level adjustment
+                     */
+                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
+
                     /* loopfilter on this macroblock. */
                     if (filter_level)
                     {
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index b643137a6..5ba14f375 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -50,8 +50,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_armv6;
         cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_armv6;
 
-        /*cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
-        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
+        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_armv6;
+        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
 
         /*cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
new file mode 100644
index 000000000..a9060d76f
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -0,0 +1,133 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
+;      So, we can remove this part of calculation.
+
+|vp8_mse16x16_armv6| PROC
+
+    push    {r4-r9, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+    mov     r4, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 7ac0ac08e..7ad7c76d3 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -20,6 +20,7 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
 extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
 extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
 extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp8_mse16x16_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 
@@ -32,6 +33,9 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
 #undef  vp8_variance_var16x16
 #define vp8_variance_var16x16 vp8_variance16x16_armv6
 
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_armv6
+
 #undef  vp8_variance_halfpixvar16x16_h
 #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
 
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index f8979f4f7..b07ee8ffb 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -37,6 +37,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_ar
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon
author	John Koleszar <jkoleszar@google.com>	2011-03-15 00:05:07 -0400
committer	John Koleszar <jkoleszar@google.com>	2011-03-15 00:05:07 -0400
commit	b210797a6ada5816a56cacb0b4106a5bf525314c (patch)
tree	f6a19e82285b78c105f7a7a05338a1e07eb65966 /vp8
parent	23c4cbdfbae53661877d449072241651b788d016 (diff)
parent	8c48c943e7c082b4330061e74ce85547253de301 (diff)
download	libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.tar libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.tar.gz libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.tar.bz2 libvpx-b210797a6ada5816a56cacb0b4106a5bf525314c.zip