Merge "Merge branch 'master' into experimental" into experimental

author: John Koleszar <jkoleszar@google.com> 2013-03-27 12:29:32 -0700
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> 2013-03-27 12:29:32 -0700
commit: 103207ba8f0f422214f7433e13fc3c58cdffa5ce (patch)
tree: 70f97e3c49a6da477532208185a0726a6fa239dc /vp8/encoder/x86
parent: 28d9202ed4d8af3eb128fa9192e6e3c4b1c9f67d (diff)
parent: 771fc832f3889c067b76899c8209bd0854ba7f7d (diff)
download: libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.tar
libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.tar.gz
libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.tar.bz2
libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.zip
2 files changed, 132 insertions, 251 deletions
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
deleted file mode 100644
index b41768ce0..000000000
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse2) PRIVATE
-sym(vp8_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp8_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
diff --git a/vp8/encoder/x86/quantize_sse2_intrinsics.c b/vp8/encoder/x86/quantize_sse2_intrinsics.c
index 55d57ad62..f495bf287 100644
--- a/vp8/encoder/x86/quantize_sse2_intrinsics.c
+++ b/vp8/encoder/x86/quantize_sse2_intrinsics.c
@@ -9,13 +9,139 @@
  */
 
 
-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/block.h"
-
-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+#define SELECT_EOB(i, z) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end_##i; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end_##i:; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
 
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
author	John Koleszar <jkoleszar@google.com>	2013-03-27 12:29:32 -0700
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	2013-03-27 12:29:32 -0700
commit	103207ba8f0f422214f7433e13fc3c58cdffa5ce (patch)
tree	70f97e3c49a6da477532208185a0726a6fa239dc /vp8/encoder/x86
parent	28d9202ed4d8af3eb128fa9192e6e3c4b1c9f67d (diff)
parent	771fc832f3889c067b76899c8209bd0854ba7f7d (diff)
download	libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.tar libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.tar.gz libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.tar.bz2 libvpx-103207ba8f0f422214f7433e13fc3c58cdffa5ce.zip