Merge master branch into experimental

Picks up some build system changes, compiler warning fixes, etc. Change-Id: I2712f99e653502818a101a72696ad54018152d4e
author: John Koleszar <jkoleszar@google.com> 2013-02-28 16:25:38 -0800
committer: John Koleszar <jkoleszar@google.com> 2013-03-01 11:06:05 -0800
commit: 69c67c9531b08c1b15985b351677a162739af43e (patch)
tree: 5e07512832085e972ce4bdc7e96472d2c4f5c545 /vp8/encoder/x86
parent: db4dc6f0c0c30f56d675a489c93b22517eedb9f8 (diff)
parent: a07bed2b2b2ca84fdb145ee8e4f1ca100d39915e (diff)
download: libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar
libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar.gz
libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar.bz2
libvpx-69c67c9531b08c1b15985b351677a162739af43e.zip
6 files changed, 115 insertions, 153 deletions
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index d880ce0c4..d06bca592 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -29,7 +29,7 @@
     movsxd      rax, dword ptr arg(2)
     lea         rcx, [rsi + rax*2]
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
@@ -53,7 +53,7 @@
     RESTORE_GOT
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     RESTORE_XMM
   %endif
 %endif
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index fe9464b3d..b41768ce0 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -27,7 +27,7 @@ sym(vp8_regular_quantize_b_sse2):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -46,7 +46,7 @@ sym(vp8_regular_quantize_b_sse2):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -226,7 +226,7 @@ ZIGZAG_LOOP 15
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
@@ -236,147 +236,6 @@ ZIGZAG_LOOP 15
     pop         rbp
     ret
 
-; void vp8_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp8_fast_quantize_b_sse2) PRIVATE
-sym(vp8_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_round]
-    mov         rdx, [rdi + vp8_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp8_blockd_qcoeff]
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 inv_zig_zag:
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
new file mode 100644
index 000000000..55d57ad62
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropy.h"
+#include "vp8/encoder/block.h"
+
+#include <mmintrin.h> //MMX
+#include <xmmintrin.h> //SSE
+#include <emmintrin.h> //SSE2
+
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
+
+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* x = abs(y) = (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  xdq0 = _mm_mullo_epi16(x0, dequant0);
+  xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+  /* build a mask for the zig zag */
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpeq_epi16(x0, zeros);
+  x1 = _mm_cmpeq_epi16(x1, zeros);
+
+  ones = _mm_cmpeq_epi16(zeros, zeros);
+
+  x0 = _mm_xor_si128(x0, ones);
+  x1 = _mm_xor_si128(x1, ones);
+
+  x0 = _mm_and_si128(x0, inv_zig_zag0);
+  x1 = _mm_and_si128(x1, inv_zig_zag1);
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* now down to 8 */
+  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* only 4 left */
+  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* okay, just 2! */
+  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index f21146457..dbd171bfc 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -31,7 +31,7 @@ sym(vp8_regular_quantize_b_sse4):
     %define stack_size 32
     sub         rsp, stack_size
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 8, u
     push        rdi
     push        rsi
@@ -43,7 +43,7 @@ sym(vp8_regular_quantize_b_sse4):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -240,7 +240,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
     pop         rbp
 %else
   %undef xmm5
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
     RESTORE_XMM
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
index 35368894d..7b1dc119f 100644
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -27,7 +27,7 @@ sym(vp8_fast_quantize_b_ssse3):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -38,7 +38,7 @@ sym(vp8_fast_quantize_b_ssse3):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -122,7 +122,7 @@ sym(vp8_fast_quantize_b_ssse3):
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index ce9d9836b..bd92b398a 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -50,7 +50,7 @@ sym(vp8_temporal_filter_apply_sse2):
         ; 0x8000 >> (16 - strength)
         mov         rdx,            16
         sub         rdx,            arg(4) ; 16 - strength
-        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
         movdqa      xmm5,           [GLOBAL(_const_top_bit)]
         psrlw       xmm5,           xmm4
         movdqa      [rsp + rounding_bit], xmm5
author	John Koleszar <jkoleszar@google.com>	2013-02-28 16:25:38 -0800
committer	John Koleszar <jkoleszar@google.com>	2013-03-01 11:06:05 -0800
commit	69c67c9531b08c1b15985b351677a162739af43e (patch)
tree	5e07512832085e972ce4bdc7e96472d2c4f5c545 /vp8/encoder/x86
parent	db4dc6f0c0c30f56d675a489c93b22517eedb9f8 (diff)
parent	a07bed2b2b2ca84fdb145ee8e4f1ca100d39915e (diff)
download	libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar.gz libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar.bz2 libvpx-69c67c9531b08c1b15985b351677a162739af43e.zip