; path: root/vp8/common/x86/iwalsh_sse2.asm
; blob: 5a7133d6c0b3e2e8d1db7e67d753b3f4f7775050
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
;
; Two-pass 4x4 inverse Walsh transform on 16-bit values (wrapping word
; arithmetic throughout).
;
;   input  : 16 shorts (32 bytes). Loaded with movdqa, so the pointer must be
;            16-byte aligned.
;   output : the 16 results are NOT stored contiguously. Result i (i = 0..15,
;            row-major) is written as one 16-bit word at byte offset 32*i,
;            so the destination must cover at least 32*15 + 2 bytes.
;
; Each pass computes, four lanes at a time:
;       a1 = ip[0] + ip[12]        b1 = ip[4] + ip[8]
;       d1 = ip[0] - ip[12]        c1 = ip[4] - ip[8]
;       op[0] = a1 + b1            op[4]  = d1 + c1
;       op[8] = a1 - b1            op[12] = d1 - c1
; and is followed by a 4x4 word transpose. After the second pass every value
; is rounded as (x + 3) >> 3 using an arithmetic shift.
;
; Registers: rsi = input, rdi = output, eax = scratch, xmm0-xmm6 = work.
; The prologue/epilogue macros come from the included x86_abi_support.asm.
global sym(vp8_short_inv_walsh4x4_sse2)
sym(vp8_short_inv_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = input
    mov         rdi, arg(1)             ; rdi = output

    movdqa      xmm0, [rsi + 0]         ;ip[4] ip[0]
    movdqa      xmm1, [rsi + 16]        ;ip[12] ip[8]

    ; Rounding term: two packed words of 3; broadcast to all lanes further down.
    mov         eax, 0x00030003
    movd        xmm6, eax

    ;---------------- first pass ----------------
    pshufd      xmm2, xmm1, 4eh         ;ip[8] ip[12]
    movdqa      xmm3, xmm0              ;ip[4] ip[0]

    paddw       xmm0, xmm2              ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
    psubw       xmm3, xmm2              ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1

    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm3              ;d1 a1
    punpckhqdq  xmm4, xmm3              ;c1 b1

    movdqa      xmm1, xmm4              ;c1 b1
    paddw       xmm4, xmm0              ;d1+c1 a1+b1 aka op[4] op[0]
    psubw       xmm0, xmm1              ;d1-c1 a1-b1 aka op[12] op[8]

    ;---------------- transpose 4x4 words ----------------
    ; xmm4: 13 12 11 10 03 02 01 00
    ; xmm0: 33 32 31 30 23 22 21 20
    movdqa      xmm3, xmm4              ; 13 12 11 10 03 02 01 00
    punpcklwd   xmm4, xmm0              ; 23 03 22 02 21 01 20 00
    punpckhwd   xmm3, xmm0              ; 33 13 32 12 31 11 30 10
    movdqa      xmm1, xmm4              ; 23 03 22 02 21 01 20 00
    punpcklwd   xmm4, xmm3              ; 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm3              ; 33 23 13 03 32 22 12 02

    ;---------------- second pass ----------------
    ; Same butterfly as above, now applied to the transposed data:
    ; xmm4 holds columns 1|0 and xmm1 holds columns 3|2 of the first-pass result.
    pshufd      xmm2, xmm1, 4eh         ;col2 col3
    movdqa      xmm3, xmm4              ;col1 col0

    pshufd      xmm6, xmm6, 0           ;03 03 03 03 03 03 03 03

    paddw       xmm4, xmm2              ;col1+col2 col0+col3 aka b1 a1
    psubw       xmm3, xmm2              ;col1-col2 col0-col3 aka c1 d1

    movdqa      xmm5, xmm4
    punpcklqdq  xmm4, xmm3              ;d1 a1
    punpckhqdq  xmm5, xmm3              ;c1 b1

    movdqa      xmm1, xmm5              ;c1 b1
    paddw       xmm5, xmm4              ;d1+c1 a1+b1
    psubw       xmm4, xmm1              ;d1-c1 a1-b1

    ;---------------- transpose back ----------------
    ; Same interleave pattern as the first transpose (indices below refer to
    ; the register contents at this point).
    movdqa      xmm0, xmm5              ; 13 12 11 10 03 02 01 00
    punpcklwd   xmm5, xmm4              ; 23 03 22 02 21 01 20 00
    punpckhwd   xmm0, xmm4              ; 33 13 32 12 31 11 30 10
    movdqa      xmm1, xmm5              ; 23 03 22 02 21 01 20 00
    punpcklwd   xmm5, xmm0              ; 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm0              ; 33 23 13 03 32 22 12 02

    ;---------------- round: (x + 3) >> 3 ----------------
    paddw       xmm5, xmm6
    paddw       xmm1, xmm6

    psraw       xmm5, 3
    psraw       xmm1, 3

    ;---------------- scatter results 0..7 (xmm5) ----------------
    ; Each movd pulls two words into eax; low word is stored first, then the
    ; high word after shifting it down. psrldq exposes the next pair.
    movd        eax, xmm5
    psrldq      xmm5, 4
    mov         word [rdi + 32*0], ax
    shr         eax, 16
    mov         word [rdi + 32*1], ax
    movd        eax, xmm5
    psrldq      xmm5, 4
    mov         word [rdi + 32*2], ax
    shr         eax, 16
    mov         word [rdi + 32*3], ax

    movd        eax, xmm5
    psrldq      xmm5, 4
    mov         word [rdi + 32*4], ax
    shr         eax, 16
    mov         word [rdi + 32*5], ax
    movd        eax, xmm5
    mov         word [rdi + 32*6], ax
    shr         eax, 16
    mov         word [rdi + 32*7], ax

    ;---------------- scatter results 8..15 (xmm1) ----------------
    movd        eax, xmm1
    psrldq      xmm1, 4
    mov         word [rdi + 32*8], ax
    shr         eax, 16
    mov         word [rdi + 32*9], ax
    movd        eax, xmm1
    psrldq      xmm1, 4
    mov         word [rdi + 32*10], ax
    shr         eax, 16
    mov         word [rdi + 32*11], ax

    movd        eax, xmm1
    psrldq      xmm1, 4
    mov         word [rdi + 32*12], ax
    shr         eax, 16
    mov         word [rdi + 32*13], ax
    movd        eax, xmm1
    mov         word [rdi + 32*14], ax
    shr         eax, 16
    mov         word [rdi + 32*15], ax

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Constant tables; each is 16-byte aligned and holds four 16-bit words.
; SECTION_RODATA is a macro that is not defined in this file (presumably it
; comes from the included x86_abi_support.asm).
; NOTE(review): none of the three labels below is referenced by the code
; visible in this file; they look like leftovers from another routine --
; confirm nothing else needs them before removing.
align 16
; Four copies of 0x8A8C (35468 unsigned).
x_s1sqr2:
    times 4 dw 0x8A8C
align 16
; Four copies of 0x4E7B (20091).
x_c1sqr2less1:
    times 4 dw 0x4E7B
align 16
; Four copies of the value 4.
fours:
    times 4 dw 0x0004