;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp9_asm_enc_offsets.asm"


; void vp9_fast_quantize_b_ssse3 | arg
;  (BLOCK  *b,                   |  0
;   BLOCKD *d)                   |  1
;
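; Quantizes one 4x4 block of 16 coefficients. Roughly, in C (a sketch
; only; the code below is branch-free SIMD over all 16 lanes at once):
;
;   for (i = 0; i < 16; i++) {
;     y          = ((abs(coeff[i]) + round[i]) * quant_fast[i]) >> 16;
;     qcoeff[i]  = coeff[i] < 0 ? -y : y;
;     dqcoeff[i] = qcoeff[i] * dequant[i];
;   }
;   d->eob = 1 + zig-zag index of the last nonzero qcoeff, or 0 if none
;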

global sym(vp9_fast_quantize_b_ssse3) PRIVATE
sym(vp9_fast_quantize_b_ssse3):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
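    ; unix x86-64 already passes the first two args in rdi/rsi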
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp9_block_coeff]
    mov         rcx, [rdi + vp9_block_round]
    mov         rdx, [rdi + vp9_block_quant_fast]

    ; coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; round
    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z)
    pabsw       xmm1, xmm1
    pabsw       xmm5, xmm5

    ; x += round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = (x * quant_fast) >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    mov         rax, [rsi + vp9_blockd_qcoeff]
    mov         rdi, [rsi + vp9_blockd_dequant]
    mov         rcx, [rsi + vp9_blockd_dqcoeff]

    ; qcoeff = (y ^ sz) - sz, i.e. re-apply the sign of z
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      [rax], xmm1                 ;store qcoeff
    movdqa      [rax + 16], xmm5            ;store qcoeff

    movdqa      xmm2, [rdi]
    movdqa      xmm3, [rdi + 16]

    pxor        xmm4, xmm4                  ;zero, for the compares below
    pmullw      xmm2, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm3, xmm5

    pcmpeqw     xmm1, xmm4                  ;FFFF where qcoeff == 0
    pcmpeqw     xmm5, xmm4                  ;(inverted to a nonzero mask below)
    packsswb    xmm1, xmm5
    pshufb      xmm1, [GLOBAL(zz_shuf)]     ;reorder bytes into zig-zag scan order

    pmovmskb    edx, xmm1                   ;one bit per coeff, set if zero

    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ;flip the bits: set now means nonzero
    bsr         eax, edx                    ;position of last nonzero coeff in scan order

    movdqa      [rcx], xmm2                 ;store dqcoeff
    movdqa      [rcx + 16], xmm3            ;store dqcoeff

    sub         edi, edx                    ;check for all zeros in bit mask
    sar         edi, 31                     ;0 or -1
    add         eax, 1
    and         eax, edi                    ;if the bit mask was all zero,
                                            ;then eob = 0
    mov         [rsi + vp9_blockd_eob], eax

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret

SECTION_RODATA
align 16
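; 4x4 zig-zag scan order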
zz_shuf:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15