author     Johann <johannkoenig@google.com>    2015-05-15 11:52:03 -0700
committer  Johann <johannkoenig@google.com>    2015-05-26 12:01:52 -0700
commit     c3bdffb0a508ad08d5dfa613c029f368d4293d4c
tree       4c087783da1d12bfbe09311ebb33f200e789ebf3 /vp8
parent     976f7f42c1ad1ff3cc0792572f9c4f41f05bb375
Move variance functions to vpx_dsp
subpel functions will be moved in another patch.
Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
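
All of the functions this commit relocates compute the same whole-pixel statistic, so a single kernel can replace the per-size copies. Below is a minimal C sketch, modeled on the consolidated variance() helper this patch adds to vp8/common/variance_c.c (the function name and the main() driver are illustrative, not libvpx code): variance is the sum of squared differences (SSE) minus the squared sum divided by the pixel count, and because block dimensions are powers of two the division reduces to a shift (>> 8 for 16x16, >> 7 for 16x8 and 8x16, >> 6 for 8x8, >> 4 for 4x4, matching the deleted wrappers).

    #include <stdio.h>

    /* Whole-pixel variance of a w x h block: SSE - sum^2 / (w * h). */
    static unsigned int variance_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        int w, int h, unsigned int *sse) {
      int sum = 0;
      unsigned int sq = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int diff = src[j] - ref[j];
          sum += diff;                       /* signed sum of differences */
          sq += (unsigned int)(diff * diff); /* sum of squared differences */
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sq;
      /* log2(w * h), computed the way the patch's ctz() helper does. */
      int shift = 0;
      for (int n = w * h; n > 1; n >>= 1) ++shift;
      return sq - (((unsigned int)sum * sum) >> shift);
    }

    int main(void) {
      unsigned char a[8 * 8] = { 0 }, b[8 * 8] = { 0 };
      unsigned int sse;
      b[0] = 4; /* one differing pixel: sse = 16, sum = -4, variance = 16 */
      printf("variance = %u, sse = %u\n",
             variance_sketch(a, 8, b, 8, 8, 8, &sse), sse);
      return 0;
    }

The per-size entry points (vpx_variance16x16 and friends) are this kernel specialized for fixed w and h.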
Diffstat (limited to 'vp8')
-rw-r--r-- | vp8/common/arm/armv6/vp8_variance16x16_armv6.asm | 154
-rw-r--r-- | vp8/common/arm/armv6/vp8_variance8x8_armv6.asm | 101
-rw-r--r-- | vp8/common/arm/neon/variance_neon.c | 320
-rw-r--r-- | vp8/common/arm/variance_arm.c | 19
-rw-r--r-- | vp8/common/mfqe.c | 20
-rw-r--r-- | vp8/common/rtcd_defs.pl | 39
-rw-r--r-- | vp8/common/variance.h | 34
-rw-r--r-- | vp8/common/variance_c.c | 147
-rw-r--r-- | vp8/common/x86/variance_impl_mmx.asm | 498
-rw-r--r-- | vp8/common/x86/variance_impl_sse2.asm | 387
-rw-r--r-- | vp8/common/x86/variance_mmx.c | 140
-rw-r--r-- | vp8/common/x86/variance_sse2.c | 141
-rw-r--r-- | vp8/common/x86/variance_ssse3.c | 9
-rw-r--r-- | vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm | 138
-rw-r--r-- | vp8/encoder/arm/neon/vp8_mse16x16_neon.c | 131
-rw-r--r-- | vp8/encoder/encodeframe.c | 3
-rw-r--r-- | vp8/encoder/encodeintra.c | 3
-rw-r--r-- | vp8/encoder/firstpass.c | 11
-rw-r--r-- | vp8/encoder/onyx_if.c | 21
-rw-r--r-- | vp8/encoder/pickinter.c | 36
-rw-r--r-- | vp8/encoder/picklpf.c | 3
-rw-r--r-- | vp8/encoder/rdopt.c | 7
-rw-r--r-- | vp8/vp8_common.mk | 3
-rw-r--r-- | vp8/vp8cx_arm.mk | 2
24 files changed, 80 insertions(+), 2287 deletions(-)
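
The diffstat shows the shape of the change: the per-architecture variance implementations leave vp8/, and callers such as mfqe.c (below) are retargeted from the vp8_-prefixed RTCD symbols to their vpx_dsp equivalents. The call signature is untouched; the hunk in vp8/common/variance.h only renames the function-pointer typedef. A hedged sketch of that contract (the typedef follows the variance.h hunk below; the assignment lines are a hypothetical illustration, since the real wiring happens through the RTCD tables):

    /* Renamed from vp8_variance_fn_t; the shape is unchanged, so vtable
     * users only swap which symbol they plug in. */
    typedef unsigned int (*vpx_variance_fn_t)(const unsigned char *src_ptr,
                                              int source_stride,
                                              const unsigned char *ref_ptr,
                                              int recon_stride,
                                              unsigned int *sse);

    /* Before this patch: vpx_variance_fn_t vf = vp8_variance16x16; */
    /* After this patch:  vpx_variance_fn_t vf = vpx_variance16x16; */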
diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm deleted file mode 100644 index 39919579f..000000000 --- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm +++ /dev/null @@ -1,154 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; 
differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm deleted file mode 100644 index 915ee4993..000000000 --- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm +++ /dev/null @@ -1,101 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_variance8x8_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance8x8_armv6| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - - END diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c deleted file mode 100644 index 1b1979073..000000000 --- a/vp8/common/arm/neon/variance_neon.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> -#include "vpx_ports/mem.h" - -unsigned int vp8_variance16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance16x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { // variance16x8_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - 
q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // variance8x16_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = 
vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 2; i++) { // variance8x8_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c index 467a50942..0f293f03d 100644 --- a/vp8/common/arm/variance_arm.c +++ b/vp8/common/arm/variance_arm.c @@ -9,10 +9,14 @@ */ #include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vp8/common/variance.h" #include "vp8/common/filter.h" +// TODO(johannkoenig): Move 
this to vpx_dsp or vp8/encoder +#if CONFIG_VP8_ENCODER + #if HAVE_MEDIA #include "vp8/common/arm/bilinearfilter_arm.h" @@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 8, 8, 8, VFilter); - return vp8_variance8x8_armv6(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); + return vpx_variance8x8_media(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); } unsigned int vp8_sub_pixel_variance16x16_armv6 @@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 16, 16, 16, VFilter); - var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); } return var; } -#endif /* HAVE_MEDIA */ +#endif // HAVE_MEDIA #if HAVE_NEON @@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); } -#endif +#endif // HAVE_NEON +#endif // CONFIG_VP8_ENCODER diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c index d12dea193..5c0680f42 100644 --- a/vp8/common/mfqe.c +++ b/vp8/common/mfqe.c @@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block if (blksize == 16) { - actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; - act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; + actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; + act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; #ifdef USE_SSD - vp8_variance16x16(y, y_stride, yd, yd_stride, &sse); + vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); sad = (sse + 128)>>8; - vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 32)>>6; - vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 32)>>6; #else sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; @@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block } else /* if (blksize == 8) */ { - actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; - act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; + actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; + act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; #ifdef USE_SSD - vp8_variance8x8(y, y_stride, yd, yd_stride, &sse); + vpx_variance8x8(y, y_stride, yd, yd_stride, &sse); sad = (sse + 32)>>6; - vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 8)>>4; - vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 8)>>4; #else sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index c9f14d58a..4b820338e 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -237,31 +237,6 @@ specialize qw/vp8_bilinear_predict4x4 mmx media neon/; $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; # -# Whole-pixel Variance -# -add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance4x4 mmx sse2/; -$vp8_variance4x4_sse2=vp8_variance4x4_wmt; - -add_proto 
qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x8 mmx sse2 media neon/; -$vp8_variance8x8_sse2=vp8_variance8x8_wmt; -$vp8_variance8x8_media=vp8_variance8x8_armv6; - -add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x16 mmx sse2 neon/; -$vp8_variance8x16_sse2=vp8_variance8x16_wmt; - -add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x8 mmx sse2 neon/; -$vp8_variance16x8_sse2=vp8_variance16x8_wmt; - -add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x16 mmx sse2 media neon/; -$vp8_variance16x16_sse2=vp8_variance16x16_wmt; -$vp8_variance16x16_media=vp8_variance16x16_armv6; - -# # Sub-pixel Variance # add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; @@ -309,26 +284,12 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") { # -# Sum of squares (vector) -# -add_proto qw/unsigned int vp8_get_mb_ss/, "const short *"; -specialize qw/vp8_get_mb_ss mmx sse2/; - -# # SSE (Sum Squared Error) # add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/; $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt; -add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_mse16x16 mmx sse2 media neon/; -$vp8_mse16x16_sse2=vp8_mse16x16_wmt; -$vp8_mse16x16_media=vp8_mse16x16_armv6; - -add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; -specialize qw/vp8_get4x4sse_cs mmx neon/; - # # Block copy # diff --git a/vp8/common/variance.h b/vp8/common/variance.h index b62cc6136..c6c9f41bf 100644 --- a/vp8/common/variance.h +++ b/vp8/common/variance.h @@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)( const unsigned char *ref_array, int ref_stride, unsigned int *sad_array); + typedef void (*vpx_sad_multi_d_fn_t) ( const unsigned char *src_ptr, @@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t) unsigned int *sad_array ); -typedef unsigned int (*vp8_variance_fn_t) +typedef unsigned int (*vpx_variance_fn_t) ( const unsigned char *src_ptr, int source_stride, @@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t) unsigned int *sse ); -typedef void (*vp8_ssimpf_fn_t) - ( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr - ); - -typedef unsigned int (*vp8_getmbss_fn_t)(const short *); - -typedef unsigned int (*vp8_get16x16prederror_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int 
ref_stride - ); - typedef struct variance_vtable { vpx_sad_fn_t sdf; - vp8_variance_fn_t vf; + vpx_variance_fn_t vf; vp8_subpixvariance_fn_t svf; - vp8_variance_fn_t svf_halfpix_h; - vp8_variance_fn_t svf_halfpix_v; - vp8_variance_fn_t svf_halfpix_hv; + vpx_variance_fn_t svf_halfpix_h; + vpx_variance_fn_t svf_halfpix_v; + vpx_variance_fn_t svf_halfpix_hv; vpx_sad_multi_fn_t sdx3f; vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c index dc95bfeb3..79d1ca00c 100644 --- a/vp8/common/variance_c.c +++ b/vp8/common/variance_c.c @@ -8,44 +8,34 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vp8_rtcd.h" #include "filter.h" #include "variance.h" - -unsigned int vp8_get_mb_ss_c -( - const short *src_ptr -) -{ - unsigned int i = 0, sum = 0; - - do - { - sum += (src_ptr[i] * src_ptr[i]); - i++; - } - while (i < 256); - - return sum; +/* This is a bad idea. + * ctz = count trailing zeros */ +static int ctz(int a) { + int b = 0; + while (a != 1) { + a >>= 1; + b++; + } + return b; } - -static void variance( +static unsigned int variance( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, int w, int h, - unsigned int *sse, - int *sum) + unsigned int *sse) { int i, j; - int diff; + int diff, sum; - *sum = 0; + sum = 0; *sse = 0; for (i = 0; i < h; i++) @@ -53,114 +43,17 @@ static void variance( for (j = 0; j < w; j++) { diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; + sum += diff; *sse += diff * diff; } src_ptr += source_stride; ref_ptr += recon_stride; } -} - -unsigned int vp8_variance16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance8x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); + return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h))))); } -unsigned int vp8_variance16x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - - -unsigned int vp8_variance8x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp8_variance4x4_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - - -unsigned int vp8_mse16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int 
recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return var; -} - - /**************************************************************************** * * ROUTINE : filter_block2d_bil_first_pass @@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c /* Now filter Verticaly */ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); - return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse); } @@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); - return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse); } unsigned int vp8_sub_pixel_variance16x16_c @@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); - return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse); } @@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); - return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse); } unsigned int vp8_sub_pixel_variance8x16_c @@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); - return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse); } diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm index 7d5e6810b..97f25275d 100644 --- a/vp8/common/x86/variance_impl_mmx.asm +++ b/vp8/common/x86/variance_impl_mmx.asm @@ -11,504 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) -global sym(vp8_get_mb_ss_mmx) PRIVATE -sym(vp8_get_mb_ss_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 8 - ; end prolog - - mov rax, arg(0) ;src_ptr - mov rcx, 16 - pxor mm4, mm4 - -.NEXTROW: - movq mm0, [rax] - movq mm1, [rax+8] - movq mm2, [rax+16] - movq mm3, [rax+24] - pmaddwd mm0, mm0 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - - paddd mm4, mm0 - paddd mm4, mm1 - paddd mm4, mm2 - paddd mm4, mm3 - - add rax, 32 - dec rcx - ja .NEXTROW - movq QWORD PTR [rsp], mm4 - - ;return sum[0]+sum[1]; - movsxd rax, dword ptr [rsp] - movsxd rcx, dword ptr [rsp+4] - add rax, rcx - - - ; begin epilog - add rsp, 8 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get8x8var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp8_get8x8var_mmx) PRIVATE -sym(vp8_get8x8var_mmx): - push rbp - mov rbp, rsp - 
SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - - ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 5 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - 
punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - ; movq mm4, [rbx + rdx] - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 6 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 7 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 8 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Now accumulate the final results. 
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp8_get4x4var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp8_get4x4var_mmx) PRIVATE -sym(vp8_get4x4var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movd mm0, [rax] ; Copy four bytes to mm0 - movd mm1, [rbx] ; Copy four bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - - ; Row 2 - movd mm0, [rax] ; Copy four bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy four bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher precision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy four bytes to mm0 - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - - ; Now accumulate the final results. 
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp8_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp8_get4x4sse_cs_mmx) PRIVATE -sym(vp8_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - %define mmx_filter_shift 7 ;void vp8_filter_block2d_bil4x4_var_mmx diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm index 761433c11..26de5e860 100644 --- a/vp8/common/x86/variance_impl_sse2.asm +++ b/vp8/common/x86/variance_impl_sse2.asm @@ -13,393 +13,6 @@ %define xmm_filter_shift 7 -;unsigned int vp8_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp8_get_mb_ss_sse2) PRIVATE -sym(vp8_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - 
pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get16x16var_sse2) PRIVATE -sym(vp8_get16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - ; Prefetch data - lea rcx, [rax+rax*2] - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax*2] - prefetcht0 [rsi+rcx] - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax*2] - prefetcht0 [rbx+rcx] - - lea rcx, [rdx+rdx*2] - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx*2] - prefetcht0 [rdi+rcx] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx*2] - prefetcht0 [rbx+rcx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - prefetcht0 [rsi+rax*8] - prefetcht0 [rdi+rdx*8] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz .var16loop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movd DWORD PTR [rax], xmm7 - movd DWORD PTR [rdi], xmm1 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - - -;unsigned int vp8_get8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get8x8var_sse2) PRIVATE -sym(vp8_get8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - movq xmm1, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rdi] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - psubsw xmm1, xmm2 - paddw xmm7, xmm1 - - pmaddwd xmm1, xmm1 - - 
movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movq xmm2, QWORD PTR[rsi + rax * 2] - movq xmm3, QWORD PTR[rdi + rdx * 2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - - punpckhwd xmm7, xmm0 - movdqa xmm2, xmm1 - - paddw xmm6, xmm7 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddw xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddw xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movq rdx, xmm7 - movsx rcx, dx - - mov dword ptr [rax], ecx - movd DWORD PTR [rdi], xmm1 - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_filter_block2d_bil_var_sse2 ;( ; unsigned char *ref_ptr, diff --git a/vp8/common/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c index 10a58b822..25ae5767f 100644 --- a/vp8/common/x86/variance_mmx.c +++ b/vp8/common/x86/variance_mmx.c @@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx short *filter ); -extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); -extern unsigned int vp8_get8x8var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_filter_block2d_bil4x4_var_mmx ( const unsigned char *ref_ptr, @@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx unsigned int *sumsquared ); - -unsigned int vp8_variance4x4_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_mmx( - 
const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 6)); - -} - -unsigned int vp8_mse16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - *sse = var; - return var; -} - - -unsigned int vp8_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance16x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - -unsigned int vp8_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - unsigned int vp8_sub_pixel_variance4x4_mmx ( const unsigned char *src_ptr, diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c index 6c6539d8e..f6dfb2787 100644 --- a/vp8/common/x86/variance_sse2.c +++ b/vp8/common/x86/variance_sse2.c @@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx unsigned int *sumsquared ); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp8_get_mb_ss_sse2 -( - const short *src_ptr -); -unsigned int 
diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c
index 6c6539d8e..f6dfb2787 100644
--- a/vp8/common/x86/variance_sse2.c
+++ b/vp8/common/x86/variance_sse2.c
@@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
     unsigned int *sumsquared
 );
-extern unsigned int vp8_get4x4var_mmx
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-
-unsigned int vp8_get_mb_ss_sse2
-(
-    const short *src_ptr
-);
-unsigned int vp8_get16x16var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-unsigned int vp8_get8x8var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
 void vp8_filter_block2d_bil_var_sse2
 (
     const unsigned char *ref_ptr,
@@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
     unsigned int *sumsquared
 );
-unsigned int vp8_variance4x4_wmt(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
-unsigned int vp8_variance8x8_wmt
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-
-unsigned int vp8_variance16x16_wmt
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0;
-    int sum0;
-
-
-    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    *sse = sse0;
-    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
-}
-unsigned int vp8_mse16x16_wmt(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-
-    unsigned int sse0;
-    int sum0;
-    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    *sse = sse0;
-    return sse0;
-
-}
-
-
-unsigned int vp8_variance16x8_wmt
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-unsigned int vp8_variance8x16_wmt
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
 unsigned int vp8_sub_pixel_variance4x4_wmt
 (
     const unsigned char *src_ptr,
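Note that vp8_mse16x16_wmt above computes the sum as a by-product and simply discards it: MSE is the same reduction without the mean correction, which is why a single vpx_mse16x16 can replace both the MMX and SSE2 wrappers. The vpx_dsp replacements keep the vp8 calling convention; for the mse functions the return value and *sse are the same quantity. A caller-side sketch (buffers and strides assumed to be set up by the caller):

    unsigned int sse;
    /* returns the 16x16 sum of squared differences and also stores it */
    unsigned int mse = vpx_mse16x16(src_ptr, src_stride,
                                    ref_ptr, ref_stride, &sse);
    /* mse == sse holds here */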
diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
index d8c8da540..2a0df640a 100644
--- a/vp8/common/x86/variance_ssse3.c
+++ b/vp8/common/x86/variance_ssse3.c
@@ -13,15 +13,6 @@
 #include "vp8/common/variance.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp8_get16x16var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
 extern void vp8_half_horiz_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
deleted file mode 100644
index 000805d4f..000000000
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int recon_stride
-; stack unsigned int *sse
-;
-;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
-;      So, we can remove this part of calculation.
-
-|vp8_mse16x16_armv6| PROC
-
-    push    {r4-r9, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     r4, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r5, [r0, #0x0]      ; load 4 src pixels
-    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r5, r6          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0x4]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-    ldr     r5, [r0, #0x8]      ; load 4 src pixels
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0xc]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    subs    r12, r12, #1        ; next row
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r1, [sp, #28]       ; get address of sse
-    mov     r0, r4              ; return sse
-    str     r4, [r1]            ; store sse
-
-    pop     {r4-r9, pc}
-
-    ENDP
-
-    END
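The ARMv6 loop above is a SIMD-within-a-register trick: `usub8` subtracts four bytes at once and sets the per-byte GE flags, `sel` keeps only the bytes where the first operand was the larger one, and OR-ing the two one-sided results yields the per-byte absolute difference; `uxtb16`/`smlad` then widen pairs of bytes to halfwords and square-accumulate them. Equivalent scalar C for one group of four pixels (illustrative only):

    /* What one usub8/sel/usub8/sel/orr group computes, followed by the two
     * smlad square-accumulates. */
    static unsigned int sse_4_pixels(const unsigned char *a,
                                     const unsigned char *b)
    {
        unsigned int sse = 0;
        int i;
        for (i = 0; i < 4; i++)
        {
            const int d = a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
            sse += (unsigned int)(d * d);
        }
        return sse;
    }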
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
deleted file mode 100644
index f806809df..000000000
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_mse16x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    int64x1_t d0s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    int32x4_t q7s32, q8s32, q9s32, q10s32;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int64x2_t q1s64;
-
-    q7s32 = vdupq_n_s32(0);
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q7s32 = vaddq_s32(q7s32, q8s32);
-    q9s32 = vaddq_s32(q9s32, q10s32);
-    q10s32 = vaddq_s32(q7s32, q9s32);
-
-    q1s64 = vpaddlq_s32(q10s32);
-    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int vp8_get4x4sse_cs_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride) {
-    int16x4_t d22s16, d24s16, d26s16, d28s16;
-    int64x1_t d0s64;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int32x4_t q7s32, q8s32, q9s32, q10s32;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int64x2_t q1s64;
-
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d4u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d1u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d5u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d6u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d3u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d7u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-
-    q11u16 = vsubl_u8(d0u8, d4u8);
-    q12u16 = vsubl_u8(d1u8, d5u8);
-    q13u16 = vsubl_u8(d2u8, d6u8);
-    q14u16 = vsubl_u8(d3u8, d7u8);
-
-    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
-    q7s32 = vmull_s16(d22s16, d22s16);
-    q8s32 = vmull_s16(d24s16, d24s16);
-    q9s32 = vmull_s16(d26s16, d26s16);
-    q10s32 = vmull_s16(d28s16, d28s16);
-
-    q7s32 = vaddq_s32(q7s32, q8s32);
-    q9s32 = vaddq_s32(q9s32, q10s32);
-    q9s32 = vaddq_s32(q7s32, q9s32);
-
-    q1s64 = vpaddlq_s32(q9s32);
-    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
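The NEON version relies on widening arithmetic: `vsubl_u8` produces 16-bit differences that are safe to reinterpret as signed (the true difference always fits in +/-255), and `vmlal_s16` squares and accumulates into four 32-bit lanes. The closing `vpaddlq_s32`/`vadd_s64` pair is a standard horizontal reduction; a self-contained sketch of just that step (ARM-only, illustrative helper name):

    #include <arm_neon.h>

    /* Reduce four 32-bit lanes to one scalar, as in the epilogues above. */
    static unsigned int horizontal_add_s32x4(int32x4_t v)
    {
        const int64x2_t pairs = vpaddlq_s32(v);  /* 4 lanes -> 2 lanes */
        const int64x1_t total = vadd_s64(vget_low_s64(pairs),
                                         vget_high_s64(pairs));
        return vget_lane_u32(vreinterpret_u32_s64(total), 0);
    }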
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 378e902c6..d381d8ddf 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "encodemb.h"
 #include "encodemv.h"
 #include "vp8/common/common.h"
@@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
      * lambda using a non-linear combination (e.g., the smallest, or second
      * smallest, etc.).
      */
-    act = vp8_variance16x16(x->src.y_buffer,
+    act = vpx_variance16x16(x->src.y_buffer,
                     x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
     act = act<<4;
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index cfa4cb927..e2de5eecb 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "quantize.h"
 #include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
@@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
         }
     }
 
-    intra_pred_var = vp8_get_mb_ss(x->src_diff);
+    intra_pred_var = vpx_get_mb_ss(x->src_diff);
 
     return intra_pred_var;
 }
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index a6ff0e7a0..3deb4abb3 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -12,6 +12,7 @@
 #include <limits.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "block.h"
 #include "onyx_int.h"
@@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
     /* Set up pointers for this macro block raw buffer */
     raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset);
-    vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
-                   (unsigned int *)(raw_motion_err));
+    vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
+                 (unsigned int *)(raw_motion_err));
 
     /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
     ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
-    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
-                   (unsigned int *)(best_motion_err));
+    vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+                 (unsigned int *)(best_motion_err));
 }
 
 static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
     int new_mv_mode_penalty = 256;
 
     /* override the default variance function to use MSE */
-    v_fn_ptr.vf = vp8_mse16x16;
+    v_fn_ptr.vf = vpx_mse16x16;
 
     /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
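vpx_get_mb_ss, now used by vp8_encode_intra above, is a plain sum of squares over the macroblock's 256 residual samples — no reference block and no mean term. A reference model, matching the C version in vpx_dsp as far as I can tell:

    /* Sum of squares of the 16x16 residual; src_diff holds 256 shorts. */
    static unsigned int mb_ss_c(const short *src)
    {
        unsigned int i, sum = 0;
        for (i = 0; i < 256; i++)
            sum += (unsigned int)(src[i] * src[i]);
        return sum;
    }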
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c2bb23295..40e29e191 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
 #endif
 
     cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
-    cpi->fn_ptr[BLOCK_16X16].vf             = vp8_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
     cpi->fn_ptr[BLOCK_16X16].svf            = vp8_sub_pixel_variance16x16;
     cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp8_variance_halfpixvar16x16_h;
     cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp8_variance_halfpixvar16x16_v;
@@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;
 
     cpi->fn_ptr[BLOCK_16X8].sdf             = vpx_sad16x8;
-    cpi->fn_ptr[BLOCK_16X8].vf              = vp8_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].vf              = vpx_variance16x8;
     cpi->fn_ptr[BLOCK_16X8].svf             = vp8_sub_pixel_variance16x8;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h   = NULL;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v   = NULL;
@@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->fn_ptr[BLOCK_16X8].sdx4df          = vpx_sad16x8x4d;
 
     cpi->fn_ptr[BLOCK_8X16].sdf             = vpx_sad8x16;
-    cpi->fn_ptr[BLOCK_8X16].vf              = vp8_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].vf              = vpx_variance8x16;
     cpi->fn_ptr[BLOCK_8X16].svf             = vp8_sub_pixel_variance8x16;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h   = NULL;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v   = NULL;
@@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->fn_ptr[BLOCK_8X16].sdx4df          = vpx_sad8x16x4d;
 
     cpi->fn_ptr[BLOCK_8X8].sdf              = vpx_sad8x8;
-    cpi->fn_ptr[BLOCK_8X8].vf               = vp8_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].vf               = vpx_variance8x8;
     cpi->fn_ptr[BLOCK_8X8].svf              = vp8_sub_pixel_variance8x8;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h    = NULL;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v    = NULL;
@@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->fn_ptr[BLOCK_8X8].sdx4df           = vpx_sad8x8x4d;
 
     cpi->fn_ptr[BLOCK_4X4].sdf              = vpx_sad4x4;
-    cpi->fn_ptr[BLOCK_4X4].vf               = vp8_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].vf               = vpx_variance4x4;
     cpi->fn_ptr[BLOCK_4X4].svf              = vp8_sub_pixel_variance4x4;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h    = NULL;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v    = NULL;
@@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
         {
             unsigned int sse;
 
-            vp8_mse16x16(orig + col, orig_stride,
+            vpx_mse16x16(orig + col, orig_stride,
                          recon + col, recon_stride, &sse);
             total_sse += sse;
@@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
                 int index = block_index_row + (j >> 4);
                 if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
                     unsigned int sse;
-                    Total += vp8_mse16x16(src + j,
+                    Total += vpx_mse16x16(src + j,
                                           source->y_stride,
                                           dst + j,
                                           dest->y_stride,
                                           &sse);
@@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
                 int index = block_index_row + (j >> 4);
                 if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
                     unsigned int sse;
-                    const unsigned int var = vp8_variance16x16(src + j,
+                    const unsigned int var = vpx_variance16x16(src + j,
                                                                ystride,
                                                                dst + j,
                                                                ystride,
@@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
                     // is small (to avoid effects from lighting change).
                     if ((sse - var) < 128) {
                         unsigned int sse2;
-                        const unsigned int act = vp8_variance16x16(src + j,
+                        const unsigned int act = vpx_variance16x16(src + j,
                                                                    ystride,
                                                                    const_source,
                                                                    0,
@@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
         for (j = 0; j < source->y_width; j += 16)
         {
             unsigned int sse;
-            Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+            Total += vpx_mse16x16(src + j, source->y_stride,
+                                  dst + j, dest->y_stride, &sse);
         }
 
         src += 16 * source->y_stride;
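A note on the `(sse - var) < 128` test kept in the denoiser hunk above: since the variance functions return var = sse - sum*sum/256 for a 16x16 block, the difference recovers exactly the mean term,

    sse - var = sum*sum / 256 = 256 * mean^2,    mean = sum / 256

so the threshold of 128 accepts blocks whose average pixel change satisfies 256 * mean^2 < 128, i.e. |mean| below roughly 0.7 gray levels — the denoiser only treats a block as free of lighting change when its DC shift is under one level.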
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 98ea5a040..053bf119a 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -11,6 +11,7 @@
 
 #include <limits.h>
 #include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
@@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
 }
-
-
-unsigned int vp8_get4x4sse_cs_c
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride
-)
-{
-    int distortion = 0;
-    int r, c;
-
-    for (r = 0; r < 4; r++)
-    {
-        for (c = 0; c < 4; c++)
-        {
-            int diff = src_ptr[c] - ref_ptr[c];
-            distortion += diff * diff;
-        }
-
-        src_ptr += source_stride;
-        ref_ptr += recon_stride;
-    }
-
-    return distortion;
-}
-
 static int get_prediction_error(BLOCK *be, BLOCKD *b)
 {
     unsigned char *sptr;
@@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
     sptr = (*(be->base_src) + be->src);
     dptr = b->predictor;
 
-    return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
+    return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
 }
 
@@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             else
             {
                 rate2 += rate;
-                distortion2 = vp8_variance16x16(
+                distortion2 = vpx_variance16x16(
                     *(b->base_src), b->src_stride,
                     x->e_mbd.predictor, 16, &sse);
                 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                                          xd->dst.y_stride,
                                          xd->predictor,
                                          16);
-            distortion2 = vp8_variance16x16
+            distortion2 = vpx_variance16x16
                 (*(b->base_src), b->src_stride,
                  x->e_mbd.predictor, 16, &sse);
             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
                              xd->dst.y_stride,
                              xd->predictor,
                              16);
-        distortion = vp8_variance16x16
+        distortion = vpx_variance16x16
             (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
         rate = x->mbmode_cost[xd->frame_type][mode];
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
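The 4x4 SSE kernel deleted from pickinter.c is the same computation that vpx_dsp provides as vpx_get4x4sse_cs (with the NEON version removed above as one of its optimized variants); like the mse functions it returns raw SSE with no mean correction. In get_prediction_error() the literal 16 is the stride of the fixed-width predictor scratch buffer, not a block dimension — a usage sketch with assumed caller-side buffers:

    /* Raw 4x4 SSE between a source block and a row of the 16-wide
     * macroblock predictor buffer. Illustrative fragment. */
    unsigned int sse4x4 = vpx_get4x4sse_cs(src_ptr, src_stride,
                                           predictor, 16);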
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 890053dcf..875b37f68 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -9,6 +9,7 @@
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
@@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
         for (j = 0; j < source->y_width; j += 16)
         {
             unsigned int sse;
-            Total += vp8_mse16x16(src + j, source->y_stride,
+            Total += vpx_mse16x16(src + j, source->y_stride,
                                   dst + j, dest->y_stride, &sse);
         }
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 9ccd85eb9..17194f0d4 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -15,6 +15,7 @@
 #include <assert.h>
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "tokenize.h"
 #include "treewriter.h"
 #include "onyx_int.h"
@@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
     }
     else
     {
-        vp8_variance8x8(uptr, pre_stride,
+        vpx_variance8x8(uptr, pre_stride,
             upred_ptr, uv_stride, &sse2);
-        vp8_variance8x8(vptr, pre_stride,
+        vpx_variance8x8(vptr, pre_stride,
             vpred_ptr, uv_stride, &sse1);
         sse2 += sse1;
     }
@@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
                 if(threshold < x->encode_breakout)
                     threshold = x->encode_breakout;
 
-                var = vp8_variance16x16
+                var = vpx_variance16x16
                     (*(b->base_src), b->src_stride,
                      x->e_mbd.predictor, 16, &sse);
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index b4c814075..c71d592f5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
@@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 050030179..0b0f6a70a 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
 #File list for media
 # encoder
 VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon
@@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_mse16x16_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c