; vp8/encoder/arm/neon/fastquantizeb_neon.asm

;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_fast_quantize_b_neon|
    EXPORT  |vp8_fast_quantize_b_pair_neon|

    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=4

;-----------------------------------------------------------------------------
;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
;
; Fast-quantizes two 16-coefficient blocks in one call. The instruction
; streams of block 1 and block 2 are interleaved.
;
; In:    r0 = b1, r1 = b2, r2 = d1, r3 = d2
; Out:   for each block: 16 x qcoeff, 16 x dqcoeff, and one byte written
;        through the pointer read from vp8_blockd_eob
; Saved: r4-r9 and q4-q7 are pushed on entry and popped before return
;
; Per coefficient (16-bit lanes):
;     x  = abs(z)                       sz = z >> 15   (0 or all ones)
;     y  = ((x + round) * quant) >> 16
;     q  = (y ^ sz) - sz                ; y with the sign of z restored
;     dq = q * dequant
;     eob = max over lanes with q != 0 of inv_zig_zag[lane]
;
; NOTE: round and quant_fast are read from b1 only and dequant from d1 only;
;       block 2 is processed with block 1's tables.
;       NOTE(review): presumably callers guarantee both blocks share these
;       tables - confirm at the call sites.
; NOTE: the coeff, round, quant_fast, dequant and inv_zig_zag loads use the
;       @128 qualifier, so those addresses must be 16-byte aligned. The
;       qcoeff/dqcoeff stores in this routine carry no alignment qualifier.
;-----------------------------------------------------------------------------
|vp8_fast_quantize_b_pair_neon| PROC

    stmfd           sp!, {r4-r9}
    vstmdb          sp!, {q4-q7}

    ldr             r4, [r0, #vp8_block_coeff]
    ldr             r5, [r0, #vp8_block_quant_fast]
    ldr             r6, [r0, #vp8_block_round]

    vld1.16         {q0, q1}, [r4@128]  ; load z (16 coefficients of b1)

    ldr             r7, [r2, #vp8_blockd_qcoeff]

    vabs.s16        q4, q0              ; calculate x = abs(z)
    vabs.s16        q5, q1

    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
    vshr.s16        q2, q0, #15         ; sz
    vshr.s16        q3, q1, #15

    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15] (b1, used for both blocks)
    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15] (b1, used for both blocks)

    ldr             r4, [r1, #vp8_block_coeff] ; r4 reused: coeff pointer of b2

    vadd.s16        q4, q6              ; x + Round
    vadd.s16        q5, q7

    vld1.16         {q0, q1}, [r4@128]  ; load z2 (overwrites z; sz is already in q2/q3)

    ; vqdmulh returns the doubled high half, i.e. ((Round+abs(z)) * Quant * 2) >> 16;
    ; the "right shift 1" further down completes y = ((Round+abs(z)) * Quant) >> 16
    vqdmulh.s16     q4, q8
    vqdmulh.s16     q5, q9

    vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
    vabs.s16        q11, q1
    vshr.s16        q12, q0, #15        ; sz2
    vshr.s16        q13, q1, #15

    ;modify data to have its original sign
    ; (done before the >>1 here; an arithmetic shift commutes with the xor by 0 / all ones)
    veor.s16        q4, q2              ; y^sz
    veor.s16        q5, q3

    vadd.s16        q10, q6             ; x2 + Round
    vadd.s16        q11, q7

    ldr             r8, [r2, #vp8_blockd_dequant]

    vqdmulh.s16     q10, q8             ; ((Round+abs(z2)) * Quant * 2) >> 16
    vqdmulh.s16     q11, q9

    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
    vshr.s16        q5, #1

    vld1.s16        {q6, q7}, [r8@128]  ;load dequant_ptr[i] (d1; q6/q7 no longer hold round)

    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
    vsub.s16        q5, q3

    vshr.s16        q10, #1             ; right shift 1 after vqdmulh -> y2
    vshr.s16        q11, #1

    ldr             r9, [r2, #vp8_blockd_dqcoeff]

    veor.s16        q10, q12            ; y2^sz2
    veor.s16        q11, q13

    vst1.s16        {q4, q5}, [r7]      ; store: d1 qcoeff = x1


    vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
    vsub.s16        q11, q13

    ldr             r6, [r3, #vp8_blockd_qcoeff] ; r6 reused: qcoeff pointer of d2

    vmul.s16        q2, q6, q4          ; x1 * Dequant (sz no longer needed in q2/q3)
    vmul.s16        q3, q7, q5

    adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table (b1 pointer is dead)

    vceq.s16        q8, q8              ; set q8 to all 1 (quant is no longer needed)

    vst1.s16        {q10, q11}, [r6]    ; store: d2 qcoeff = x2

    vmul.s16        q12, q6, q10        ; x2 * Dequant (sz2 no longer needed in q12/q13)
    vmul.s16        q13, q7, q11

    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order (replaces dequant)

    vtst.16         q14, q4, q8         ; now find eob
    vtst.16         q15, q5, q8         ; non-zero element is set to all 1

    vst1.s16        {q2, q3}, [r9]      ; store d1 dqcoeff = x1 * Dequant

    ldr             r7, [r3, #vp8_blockd_dqcoeff] ; r7 reused: dqcoeff pointer of d2

    vand            q0, q6, q14         ; get all valid numbers from scan array
    vand            q1, q7, q15

    vst1.s16        {q12, q13}, [r7]    ; store d2 dqcoeff = x2 * Dequant

    vtst.16         q2, q10, q8         ; now find eob of block 2
    vtst.16         q3, q11, q8         ; non-zero element is set to all 1

    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1

    vand            q10, q6, q2         ; get all valid numbers from scan array
    vand            q11, q7, q3
    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11

    ; horizontal max: 8 lanes -> 4 (u16), widen to u32, 4 -> 2, then pairwise 2 -> 1
    vmax.u16        d0, d0, d1
    vmax.u16        d20, d20, d21
    vmovl.u16       q0, d0
    vmovl.u16       q10, d20

    vmax.u32        d0, d0, d1
    vmax.u32        d20, d20, d21
    vpmax.u32       d0, d0, d0
    vpmax.u32       d20, d20, d20

    ldr             r4, [r2, #vp8_blockd_eob]
    ldr             r5, [r3, #vp8_blockd_eob]

    vst1.8          {d0[0]}, [r4]       ; store eob of d1 (lowest byte of the maximum)
    vst1.8          {d20[0]}, [r5]      ; store eob of d2 (lowest byte of the maximum)

    vldmia          sp!, {q4-q7}
    ldmfd           sp!, {r4-r9}
    bx              lr

    ENDP

;-----------------------------------------------------------------------------
;void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
;
; Fast-quantizes one 16-coefficient block.
;
; In:    r0 = b, r1 = d
; Out:   16 x qcoeff, 16 x dqcoeff, and one byte written through the pointer
;        read from vp8_blockd_eob
; Saved: r4-r7 are pushed on entry and popped before return. Only q0-q3 and
;        q8-q15 are used, so no NEON registers are pushed here.
;
; Per coefficient (16-bit lanes):
;     x  = abs(z)                       sz = z >> 15   (0 or all ones)
;     y  = ((x + round) * quant) >> 16
;     q  = (y ^ sz) - sz                ; y with the sign of z restored
;     dq = q * dequant
;     eob = max over lanes with q != 0 of inv_zig_zag[lane]
;
; An "all coefficients are zero" test (steps 1-6 in the comments) is spread
; through the start of the quantization stream. When it hits, the routine
; branches to zero_output and the partially computed results are discarded.
;
; NOTE: the coeff, round, quant_fast, dequant and inv_zig_zag loads and the
;       qcoeff/dqcoeff stores all use the @128 qualifier, so those addresses
;       must be 16-byte aligned.
;-----------------------------------------------------------------------------
|vp8_fast_quantize_b_neon| PROC

    stmfd           sp!, {r4-r7}

    ldr             r3, [r0, #vp8_block_coeff]
    ldr             r4, [r0, #vp8_block_quant_fast]
    ldr             r5, [r0, #vp8_block_round]

    vld1.16         {q0, q1}, [r3@128]  ; load z
    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
    ldr             r6, [r1, #vp8_blockd_qcoeff]
    ldr             r7, [r1, #vp8_blockd_dqcoeff]
    vorr.s16        d28, d28, d29       ; check if all zero (step 2)

    vabs.s16        q12, q0             ; calculate x = abs(z)
    vabs.s16        q13, q1

    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
    vshr.s16        q2, q0, #15         ; sz
    vmov            r2, r3, d28         ; check if all zero (step 3); r3 (coeff ptr) is dead here
    vshr.s16        q3, q1, #15

    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]

    vadd.s16        q12, q14            ; x + Round
    vadd.s16        q13, q15

    adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table (b pointer is dead)

    ; vqdmulh returns the doubled high half, i.e. ((Round+abs(z)) * Quant * 2) >> 16;
    ; the "right shift 1" below completes y = ((Round+abs(z)) * Quant) >> 16
    vqdmulh.s16     q12, q8
    vqdmulh.s16     q13, q9

    vld1.16         {q10, q11}, [r0@128]; load inverse scan order

    vceq.s16        q8, q8              ; set q8 to all 1 (quant is no longer needed)

    ldr             r4, [r1, #vp8_blockd_dequant] ; r4 reused: dequant pointer

    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
    vshr.s16        q13, #1

    ldr             r5, [r1, #vp8_blockd_eob] ; r5 reused: eob pointer

    orr             r2, r2, r3          ; check if all zero (step 4)
    cmp             r2, #0              ; check if all zero (step 5)
    beq             zero_output         ; check if all zero (step 6)

    ;modify data to have its original sign
    veor.s16        q12, q2             ; y^sz
    veor.s16        q13, q3

    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
    vsub.s16        q13, q3

    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i] (sz no longer needed in q2/q3)

    vtst.16         q14, q12, q8        ; now find eob
    vtst.16         q15, q13, q8        ; non-zero element is set to all 1

    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1

    vand            q10, q10, q14       ; get all valid numbers from scan array
    vand            q11, q11, q15


    ; horizontal max: 16 lanes -> 8 -> 4 (u16), widen to u32, 4 -> 2, then pairwise 2 -> 1
    vmax.u16        q0, q10, q11        ; find maximum value in q10, q11
    vmax.u16        d0, d0, d1
    vmovl.u16       q0, d0

    vmul.s16        q2, q12             ; x * Dequant
    vmul.s16        q3, q13

    vmax.u32        d0, d0, d1
    vpmax.u32       d0, d0, d0

    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant

    vst1.8          {d0[0]}, [r5]       ; store eob (lowest byte of the maximum)

    ldmfd           sp!, {r4-r7}
    bx              lr

; Reached only when every input coefficient is zero: r2 is 0 and q0/q1 still
; hold the (all-zero) input, so they double as the zero source for the stores.
zero_output
    strb            r2, [r5]            ; store eob = 0
    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0

    ldmfd           sp!, {r4-r7}
    bx              lr

    ENDP

; default inverse zigzag table is defined in vp8/common/entropy.c
;
; 16 halfword entries, one per coefficient lane in load order. Both quantize
; routines AND this table with the per-lane "quantized value is non-zero" mask
; and store the maximum surviving entry as eob. Entries run 1..16, so a block
; with no non-zero quantized coefficient yields eob = 0.
; NOTE(review): presumably entry i is the zig-zag scan position of
; coefficient i plus one - confirm against vp8/common/entropy.c.
;
; The table lives in the code area because both routines take its address
; with adr (PC-relative).
    ALIGN 16    ; enable use of @128 bit aligned loads
inv_zig_zag
    DCW 0x0001, 0x0002, 0x0006, 0x0007
    DCW 0x0003, 0x0005, 0x0008, 0x000d
    DCW 0x0004, 0x0009, 0x000c, 0x000e
    DCW 0x000a, 0x000b, 0x000f, 0x0010

    END