1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"
; void vp8_fast_quantize_b_ssse3 | arg
; (BLOCK *b, | 0
; BLOCKD *d) | 1
;
global sym(vp8_fast_quantize_b_ssse3) PRIVATE
sym(vp8_fast_quantize_b_ssse3):
push rbp
mov rbp, rsp
GET_GOT rbx
%if ABI_IS_32BIT
push rdi
push rsi
%else
%if LIBVPX_YASM_WIN64
push rdi
push rsi
%endif
%endif
; end prolog
%if ABI_IS_32BIT
mov rdi, arg(0) ; BLOCK *b
mov rsi, arg(1) ; BLOCKD *d
%else
%if LIBVPX_YASM_WIN64
mov rdi, rcx ; BLOCK *b
mov rsi, rdx ; BLOCKD *d
%else
;mov rdi, rdi ; BLOCK *b
;mov rsi, rsi ; BLOCKD *d
%endif
%endif
mov rax, [rdi + vp8_block_coeff]
mov rcx, [rdi + vp8_block_round]
mov rdx, [rdi + vp8_block_quant_fast]
; coeff
movdqa xmm0, [rax]
movdqa xmm4, [rax + 16]
; round
movdqa xmm2, [rcx]
movdqa xmm3, [rcx + 16]
movdqa xmm1, xmm0
movdqa xmm5, xmm4
; sz = z >> 15
psraw xmm0, 15
psraw xmm4, 15
pabsw xmm1, xmm1
pabsw xmm5, xmm5
paddw xmm1, xmm2
paddw xmm5, xmm3
; quant_fast
pmulhw xmm1, [rdx]
pmulhw xmm5, [rdx + 16]
mov rax, [rsi + vp8_blockd_qcoeff]
mov rdi, [rsi + vp8_blockd_dequant]
mov rcx, [rsi + vp8_blockd_dqcoeff]
movdqa xmm2, xmm1 ;store y for getting eob
movdqa xmm3, xmm5
pxor xmm1, xmm0
pxor xmm5, xmm4
psubw xmm1, xmm0
psubw xmm5, xmm4
movdqa [rax], xmm1
movdqa [rax + 16], xmm5
movdqa xmm0, [rdi]
movdqa xmm4, [rdi + 16]
pmullw xmm0, xmm1
pmullw xmm4, xmm5
pxor xmm1, xmm1
pcmpgtw xmm2, xmm1 ;calculate eob
pcmpgtw xmm3, xmm1
packsswb xmm2, xmm3
pshufb xmm2, [GLOBAL(zz_shuf)]
pmovmskb edx, xmm2
movdqa [rcx], xmm0 ;store dqcoeff
movdqa [rcx + 16], xmm4 ;store dqcoeff
mov rcx, [rsi + vp8_blockd_eob]
bsr eax, edx ;count 0
add eax, 1
cmp edx, 0 ;if all 0, eob=0
cmove eax, edx
mov BYTE PTR [rcx], al ;store eob
; begin epilog
%if ABI_IS_32BIT
pop rsi
pop rdi
%else
%if LIBVPX_YASM_WIN64
pop rsi
pop rdi
%endif
%endif
RESTORE_GOT
pop rbp
ret
SECTION_RODATA
align 16
zz_shuf:
db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
|