changes to start experimenting with color segmentation prediction modes.

author: Jim Bankoski <jimbankoski@google.com> 2010-11-16 14:38:40 -0500
committer: Jim Bankoski <jimbankoski@google.com> 2010-11-16 14:38:40 -0500
commit: b4a3602f662519c88c06a9efb9cbeb0d224cda16 (patch)
tree: 5f0f76af56de4bfdb0faf7f0d9f62dc2cadc77e5 /vp8/common/x86
parent: 00fe7441e9c5cbd898a8382d49cf9396d8482464 (diff)
download: libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.tar
libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.tar.gz
libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.tar.bz2
libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.zip
1 files changed, 484 insertions, 0 deletions
diff --git a/vp8/common/x86/mask_sse3.asm b/vp8/common/x86/mask_sse3.asm
new file mode 100644
index 000000000..0d90cfa86
--- /dev/null
+++ b/vp8/common/x86/mask_sse3.asm
@@ -0,0 +1,484 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;-----------------------------------------------------------------------
+; void vp8_makemask_sse3(
+;     unsigned char *y,     arg(0)  luma source
+;     unsigned char *u,     arg(1)  u source
+;     unsigned char *v,     arg(2)  v source
+;     unsigned char *ym,    arg(3)  output mask (stored with movdqa)
+;     int yp,               arg(4)  byte step between luma rows
+;     int uvp,              arg(5)  byte step between u/v rows
+;     int ys, us, vs,       arg(6..8)   center values (low byte is used)
+;     int yt, ut, vt)       arg(9..11)  tolerances    (low byte is used)
+;
+; Writes 16 rows of 16 mask bytes contiguously to ym.  A mask byte is
+; 0xFF when |y - ys| < yt and |u - us| < ut and |v - vs| < vt, else 0x00.
+; One u/v row of 8 samples serves two luma rows and every u/v sample is
+; doubled horizontally to cover two luma columns.
+;
+; All three comparisons are unsigned: pcmpgtb itself is a signed byte
+; compare, so both operands have their sign bits flipped (xor 0x80)
+; first.  Without that, any absolute difference >= 128 looks negative
+; and passes every tolerance.
+;
+; NOTE(review): the previous prototype comment also listed
+; "unsigned char *uvm" after ym, and SHADOW_ARGS_TO_STACK shadows 14
+; slots; this code reads only arg(0)..arg(11) in the order shown above.
+; Confirm against the C declaration.
+; NOTE(review): 16 bytes are loaded from each of u and v per row pair
+; although only the low 8 are used.
+; NOTE(review): xmm6-xmm12 are modified and not preserved; Win64 treats
+; xmm6-xmm15 as callee-saved.
+;
+; Clobb: rax, rcx, rdx, r8, xmm0-xmm12, flags
+;        (rbx, rsi, rdi are saved and restored)
+;-----------------------------------------------------------------------
+global sym(vp8_makemask_sse3)
+sym(vp8_makemask_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 14
+    push        rsi
+    push        rdi
+    push        rbx                     ; callee-saved; holds the y row step below
+    ; end prolog
+
+        mov             rsi,        arg(0) ;y
+        mov             rdi,        arg(1) ;u
+        mov             rcx,        arg(2) ;v
+        mov             rax,        arg(3) ;ym
+        movsxd          rbx,        dword arg(4) ;yp
+        movsxd          rdx,        dword arg(5) ;uvp
+
+        ; an all-zero pshufb control replicates byte 0 into all 16 bytes
+        pxor            xmm0, xmm0
+
+        ;make 16 copies of the center y value
+        movd            xmm1, arg(6)
+        pshufb          xmm1, xmm0
+
+        ;make 16 copies of the center u value
+        movd            xmm2, arg(7)
+        pshufb          xmm2, xmm0
+
+        ;make 16 copies of the center v value
+        movd            xmm3, arg(8)
+        pshufb          xmm3, xmm0
+        unpcklpd        xmm2, xmm3      ; xmm2 = 8 x us | 8 x vs
+
+        ;make 16 copies of the y tolerance
+        movd            xmm3, arg(9)
+        pshufb          xmm3, xmm0
+
+        ;make 16 copies of the u tolerance
+        movd            xmm4, arg(10)
+        pshufb          xmm4, xmm0
+
+        ;make 16 copies of the v tolerance
+        movd            xmm5, arg(11)
+        pshufb          xmm5, xmm0
+        unpckhpd        xmm4, xmm5      ; xmm4 = 8 x ut | 8 x vt
+
+        ;xmm12 = 0x80 in every byte (words of -128 packed with signed saturation)
+        pcmpeqb         xmm12, xmm12
+        psllw           xmm12, 7
+        packsswb        xmm12, xmm12
+
+        ;bias the tolerances once so the signed pcmpgtb orders them as unsigned
+        pxor            xmm3, xmm12
+        pxor            xmm4, xmm12
+
+        mov             r8, 8           ; 8 pairs of luma rows
+
+.next_pair_of_rows:
+
+        ;row 0 of the pair: abs difference between y source and y center
+        movdqu          xmm0, [rsi]
+        movdqa          xmm6, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm6, xmm7
+        por             xmm0, xmm6
+
+        ;xmm6 = 0xFF where abs difference < y tolerance (unsigned)
+        pxor            xmm0, xmm12
+        movdqa          xmm6, xmm3
+        pcmpgtb         xmm6, xmm0
+
+        ;row 1 of the pair: abs difference between y source and y center
+        add             rsi, rbx
+        movdqu          xmm0, [rsi]
+        movdqa          xmm11, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm11, xmm7
+        por             xmm0, xmm11
+
+        ;xmm11 = 0xFF where abs difference < y tolerance (unsigned)
+        pxor            xmm0, xmm12
+        movdqa          xmm11, xmm3
+        pcmpgtb         xmm11, xmm0
+
+        ;grab the u and v source values: xmm7 = 8 u bytes | 8 v bytes
+        movdqu          xmm7, [rdi]
+        movdqu          xmm8, [rcx]
+        unpcklpd        xmm7, xmm8
+
+        ;compute abs difference between source and uv centers
+        movdqa          xmm9, xmm2
+        movdqa          xmm10, xmm7
+        psubusb         xmm7, xmm2
+        psubusb         xmm9, xmm10
+        por             xmm7, xmm9
+
+        ;xmm0 = 0xFF where abs difference < u/v tolerance (unsigned)
+        pxor            xmm7, xmm12
+        movdqa          xmm0, xmm4
+        pcmpgtb         xmm0, xmm7
+
+        ;double the u and v masks: xmm8 = u mask x2, xmm0 = v mask x2
+        movdqa          xmm8, xmm0
+        punpckhbw       xmm0, xmm0
+        punpcklbw       xmm8, xmm8
+
+        ;mask row 0 and output
+        pand            xmm6, xmm8
+        pand            xmm6, xmm0
+        movdqa          [rax], xmm6
+
+        ;mask row 1 and output
+        pand            xmm11, xmm8
+        pand            xmm11, xmm0
+        movdqa          [rax+16], xmm11
+
+        ;advance to the next pair of rows
+        add             rsi, rbx
+        add             rdi, rdx
+        add             rcx, rdx
+        add             rax, 32
+        dec             r8
+        jnz             .next_pair_of_rows
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;-----------------------------------------------------------------------
+; GROW_HORIZ dst, src
+;   dst = src | (src shifted by one byte toward higher byte positions)
+;             | (src shifted by one byte toward lower byte positions)
+;   src is loaded with movdqa (xmm register or aligned memory operand).
+;   Overwrites xmm14 and xmm15 as scratch.
+;-----------------------------------------------------------------------
+%macro GROW_HORIZ 2
+    movdqa          %1, %2
+    movdqa          xmm15, %1
+    movdqa          xmm14, %1
+    psrldq          xmm15, 1
+    pslldq          xmm14, 1
+    por             %1, xmm15
+    por             %1, xmm14
+%endmacro
+
+;-----------------------------------------------------------------------
+; GROW_VERT result, center row, above row, below row
+;   result = center | above | below
+;-----------------------------------------------------------------------
+%macro GROW_VERT 4
+    movdqa          %1, %2
+    por             %1, %4
+    por             %1, %3
+%endmacro
+
+;-----------------------------------------------------------------------
+; GROW_NEXTLINE new line to grow, new source, line to write
+;   Horizontally grows the new source into the given register, then
+;   stores xmm0 | xmm1 | xmm2 to the output line.  The three row
+;   registers and the xmm3 temporary are fixed, so the caller must keep
+;   its rows in xmm0..xmm2.  Also overwrites xmm14 and xmm15.
+;-----------------------------------------------------------------------
+%macro GROW_NEXTLINE 3
+    GROW_HORIZ %1, %2
+    movdqa          xmm3, xmm0
+    por             xmm3, xmm1
+    por             xmm3, xmm2
+    movdqa          %3, xmm3
+%endmacro
+
+
+;void vp8_growmaskmb_sse3(
+;    unsigned char *om,   arg(0): source mask, 16 rows of 16 bytes, read with movdqa
+;    unsigned char *nm)   arg(1): result mask, 16 rows of 16 bytes, written with movdqa
+global sym(vp8_growmaskmb_sse3)
+sym(vp8_growmaskmb_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+; Each result row is the OR of a horizontally grown source row (GROW_HORIZ) with the grown rows directly above and below it.
+    mov             rsi,        arg(0) ;src (om)
+    mov             rdi,        arg(1) ;result (nm)
+; xmm0..xmm2 hold the three most recently grown rows; "hN" below means grown source row N.
+    GROW_HORIZ xmm0, [rsi]              ; xmm0 = h0
+    GROW_HORIZ xmm1, [rsi+16]           ; xmm1 = h1
+    GROW_HORIZ xmm2, [rsi+32]           ; xmm2 = h2
+; Rows 0 and 15 have only one vertical neighbour, so they are combined by hand rather than by GROW_NEXTLINE.
+    GROW_VERT xmm3, xmm0, xmm1, xmm2    ; xmm3 = h0|h1|h2 -> result row 1 (computed before xmm0 is modified)
+    por xmm0,xmm1                       ; xmm0 = h0|h1 -> result row 0
+    movdqa [rdi], xmm0
+    movdqa [rdi+16],xmm3
+; Each GROW_NEXTLINE grows source row N into the register holding the oldest row and stores result row N-1.
+    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]    ; grow row 3,  store row 2
+    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]    ; grow row 4,  store row 3
+    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]    ; grow row 5,  store row 4
+    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]    ; grow row 6,  store row 5
+    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]   ; grow row 7,  store row 6
+    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]  ; grow row 8,  store row 7
+    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]  ; grow row 9,  store row 8
+    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]  ; grow row 10, store row 9
+    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]  ; grow row 11, store row 10
+    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]  ; grow row 12, store row 11
+    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]  ; grow row 13, store row 12
+    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]  ; grow row 14, store row 13
+    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]  ; grow row 15, store row 14
+; At this point xmm0 = h15 and xmm2 = h14.
+    por xmm0,xmm2                       ; xmm0 = h15|h14 -> result row 15
+    movdqa [rdi+240], xmm0
+; NOTE(review): xmm0-xmm3 plus xmm14/xmm15 (scratch inside GROW_HORIZ) are modified and not preserved; Win64 treats xmm6-xmm15 as callee-saved.
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;-----------------------------------------------------------------------
+; unsigned int vp8_sad16x16_masked_wmt(
+;     unsigned char *src_ptr,
+;     int  src_stride,
+;     unsigned char *ref_ptr,
+;     int  ref_stride,
+;     unsigned char *mask)
+;
+; Sum of absolute differences over 16 rows of 16 bytes.  Source and
+; reference rows are both ANDed with the matching mask row before
+; psadbw, so bytes whose mask is 0x00 contribute nothing.  The mask is
+; read as 16 consecutive 16-byte rows (no stride).
+;
+; Out:   rax = SAD
+; Clobb: rcx, rdx, xmm0-xmm4, flags
+;        (rbx, rsi, rdi are saved and restored)
+;-----------------------------------------------------------------------
+global sym(vp8_sad16x16_masked_wmt)
+sym(vp8_sad16x16_masked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx                     ; callee-saved; used as the mask pointer
+    ; end prolog
+
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+    mov             rbx,        arg(4) ;mask
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16      ; rows remaining
+    pxor            xmm3,       xmm3    ; running sums, one per 64-bit half
+
+.next_row:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+    pand            xmm0,       xmm2
+    pand            xmm1,       xmm2
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi,        rax
+    add             rdi,        rdx
+    add             rbx,        16
+
+    dec             rcx
+    jnz             .next_row
+
+    ; add the high-half sum to the low-half sum and return it
+    movdqa          xmm4,       xmm3
+    psrldq          xmm4,       8
+    paddw           xmm3,       xmm4
+    movq            rax,        xmm3
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;-----------------------------------------------------------------------
+; unsigned int vp8_sad16x16_unmasked_wmt(
+;     unsigned char *src_ptr,
+;     int  src_stride,
+;     unsigned char *ref_ptr,
+;     int  ref_stride,
+;     unsigned char *mask)
+;
+; Sum of absolute differences over 16 rows of 16 bytes.  Source and
+; reference rows are both ORed with the matching mask row before
+; psadbw, so bytes whose mask is 0xFF become equal and contribute
+; nothing, while bytes whose mask is 0x00 are compared unchanged.  The
+; mask is read as 16 consecutive 16-byte rows (no stride).
+;
+; Out:   rax = SAD
+; Clobb: rcx, rdx, xmm0-xmm4, flags
+;        (rbx, rsi, rdi are saved and restored)
+;-----------------------------------------------------------------------
+global sym(vp8_sad16x16_unmasked_wmt)
+sym(vp8_sad16x16_unmasked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx                     ; callee-saved; used as the mask pointer
+    ; end prolog
+
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+    mov             rbx,        arg(4) ;mask
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16      ; rows remaining
+    pxor            xmm3,       xmm3    ; running sums, one per 64-bit half
+
+.next_row:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+    por             xmm0,       xmm2
+    por             xmm1,       xmm2
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi,        rax
+    add             rdi,        rdx
+    add             rbx,        16
+
+    dec             rcx
+    jnz             .next_row
+
+    ; add the high-half sum to the low-half sum and return it
+    movdqa          xmm4,       xmm3
+    psrldq          xmm4,       8
+    paddw           xmm3,       xmm4
+    movq            rax,        xmm3
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;-----------------------------------------------------------------------
+; unsigned int vp8_masked_predictor_wmt(
+;     unsigned char *masked,
+;     unsigned char *unmasked,
+;     int  src_stride,
+;     unsigned char *dst_ptr,
+;     int  dst_stride,
+;     unsigned char *mask)
+;
+; Blends two 16x16 predictors under a byte mask, row by row:
+;     dst = (masked & mask) | (unmasked & ~mask)
+; masked and unmasked both advance by src_stride, dst advances by
+; dst_stride, and the mask is read as 16 consecutive 16-byte rows.
+;
+; The unmasked pointer used to advance by dst_stride; the prototype
+; declares a single source stride and vp8_masked_predictor_uv_wmt
+; steps both sources by src_stride, so this routine now does the same.
+;
+; NOTE(review): the prototype says unsigned int, but no result is
+; computed; rax holds src_stride on return.
+;
+; Clobb: rax, rcx, rdx, r11, xmm0-xmm2, flags
+;        (rbx, rsi, rdi are saved and restored)
+;-----------------------------------------------------------------------
+global sym(vp8_masked_predictor_wmt)
+sym(vp8_masked_predictor_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    push        rbx                     ; callee-saved; used as the mask pointer
+    ; end prolog
+
+    mov             rsi,        arg(0) ;masked
+    mov             rdi,        arg(1) ;unmasked
+    mov             rbx,        arg(5) ;mask
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ;dst_ptr
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        16      ; rows remaining
+
+.next_row:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+
+    pand            xmm0,       xmm2    ; masked & mask
+    pandn           xmm2,       xmm1    ; ~mask & unmasked
+    por             xmm0,       xmm2
+    movdqu          [r11],      xmm0
+
+    add             r11,        rdx
+    add             rsi,        rax
+    add             rdi,        rax     ; both sources share src_stride
+    add             rbx,        16
+
+    dec             rcx
+    jnz             .next_row
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;-----------------------------------------------------------------------
+; unsigned int vp8_masked_predictor_uv_wmt(
+;     unsigned char *masked,
+;     unsigned char *unmasked,
+;     int  src_stride,
+;     unsigned char *dst_ptr,
+;     int  dst_stride,
+;     unsigned char *mask)
+;
+; Blends two 8x8 predictors under a byte mask, row by row:
+;     dst = (masked & mask) | (unmasked & ~mask)
+; masked and unmasked both advance by src_stride, dst advances by
+; dst_stride, and the mask is read as 8 consecutive 8-byte rows.
+;
+; NOTE(review): the prototype says unsigned int, but no result is
+; computed; rax holds src_stride on return.
+;
+; Clobb: rax, rcx, rdx, r11, xmm0-xmm2, flags
+;        (rbx, rsi, rdi are saved and restored)
+;-----------------------------------------------------------------------
+global sym(vp8_masked_predictor_uv_wmt)
+sym(vp8_masked_predictor_uv_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    push        rbx                     ; callee-saved; used as the mask pointer
+    ; end prolog
+
+    mov             rsi,        arg(0) ;masked
+    mov             rdi,        arg(1) ;unmasked
+    mov             rbx,        arg(5) ;mask
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ;dst_ptr
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        8       ; rows remaining
+
+.next_row:
+    movq            xmm0,       [rsi]
+    movq            xmm1,       [rdi]
+    movq            xmm2,       [rbx]
+
+    pand            xmm0,       xmm2    ; masked & mask
+    pandn           xmm2,       xmm1    ; ~mask & unmasked
+    por             xmm0,       xmm2
+    movq            [r11],      xmm0
+
+    add             r11,        rdx
+    add             rsi,        rax
+    add             rdi,        rax
+    add             rbx,        8
+
+    dec             rcx
+    jnz             .next_row
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;-----------------------------------------------------------------------
+; unsigned int vp8_uv_from_y_mask(
+;     unsigned char *ymask,
+;     unsigned char *uvmask)
+;
+; Builds an 8x8 byte mask from a 16x16 byte mask by keeping the
+; even-indexed bytes of every other 16-byte row: the source advances 32
+; bytes and the destination 8 bytes per iteration, for 8 iterations.
+;
+; The shuffle control is loaded once, RIP-relative, so the routine
+; assembles and links as position-independent 64-bit code.
+;
+; NOTE(review): the prototype says unsigned int, but rax is never
+; written here.
+;
+; Clobb: rcx, xmm0, xmm1, flags (rsi, rdi are saved and restored)
+;-----------------------------------------------------------------------
+global sym(vp8_uv_from_y_mask)
+sym(vp8_uv_from_y_mask):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;ymask
+    mov             rdi,        arg(1) ;uvmask
+
+    mov             rcx,        8       ; output rows remaining
+    movdqa          xmm1,       [rel shuf1b] ; selects bytes 0,2,..,14 into the low 8 bytes
+
+.next_row:
+    movdqu          xmm0,       [rsi]
+    pshufb          xmm0,       xmm1
+    movq            [rdi],      xmm0
+    add             rdi,        8
+    add             rsi,        32      ; skip the odd source row
+
+    dec             rcx
+    jnz             .next_row
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16 ; shuf1b is used directly as a pshufb memory operand, which must be 16-byte aligned
+shuf1b: ; pshufb control for vp8_uv_from_y_mask
+    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 ; low 8 result bytes = source bytes 0,2,..,14; the high 8 are not stored by the caller
+
author	Jim Bankoski <jimbankoski@google.com>	2010-11-16 14:38:40 -0500
committer	Jim Bankoski <jimbankoski@google.com>	2010-11-16 14:38:40 -0500
commit	b4a3602f662519c88c06a9efb9cbeb0d224cda16 (patch)
tree	5f0f76af56de4bfdb0faf7f0d9f62dc2cadc77e5 /vp8/common/x86
parent	00fe7441e9c5cbd898a8382d49cf9396d8482464 (diff)
download	libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.tar libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.tar.gz libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.tar.bz2 libvpx-b4a3602f662519c88c06a9efb9cbeb0d224cda16.zip