author    John Koleszar <jkoleszar@google.com>  2013-02-28 16:25:38 -0800
committer John Koleszar <jkoleszar@google.com>  2013-03-01 11:06:05 -0800
commit    69c67c9531b08c1b15985b351677a162739af43e (patch)
tree      5e07512832085e972ce4bdc7e96472d2c4f5c545 /vp8/encoder/x86
parent    db4dc6f0c0c30f56d675a489c93b22517eedb9f8 (diff)
parent    a07bed2b2b2ca84fdb145ee8e4f1ca100d39915e (diff)
Merge master branch into experimental
Picks up some build system changes, compiler warning fixes, etc.

Change-Id: I2712f99e653502818a101a72696ad54018152d4e
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r--  vp8/encoder/x86/dct_sse2.asm                    |   4
-rw-r--r--  vp8/encoder/x86/quantize_sse2.asm               | 147
-rw-r--r--  vp8/encoder/x86/quantize_sse2.c                 | 103
-rw-r--r--  vp8/encoder/x86/quantize_sse4.asm               |   6
-rw-r--r--  vp8/encoder/x86/quantize_ssse3.asm              |   6
-rw-r--r--  vp8/encoder/x86/temporal_filter_apply_sse2.asm  |   2
6 files changed, 115 insertions, 153 deletions
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index d880ce0c4..d06bca592 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -29,7 +29,7 @@
movsxd rax, dword ptr arg(2)
lea rcx, [rsi + rax*2]
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
%define input rcx
%define output rdx
%define pitch r8
@@ -53,7 +53,7 @@
RESTORE_GOT
pop rbp
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
RESTORE_XMM
%endif
%endif
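
Some context for the substitution this diff makes throughout: __OUTPUT_FORMAT__ is the assembler's name for its object format, and yasm calls 64-bit Windows output "x64" while nasm calls it "win64", so a literal %ifidn test against x64 only fires under yasm. Centralizing the check in one LIBVPX_YASM_WIN64 flag, presumably defined once in x86_abi_support.asm along these lines (a guess at the definition; it is not part of this diff), lets every file test a single symbol:

    %ifidn __OUTPUT_FORMAT__,win64
    %define LIBVPX_YASM_WIN64 1
    %elifidn __OUTPUT_FORMAT__,x64
    %define LIBVPX_YASM_WIN64 1
    %else
    %define LIBVPX_YASM_WIN64 0
    %endif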
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index fe9464b3d..b41768ce0 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -27,7 +27,7 @@ sym(vp8_regular_quantize_b_sse2):
push rdi
push rsi
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
push rdi
push rsi
%endif
@@ -46,7 +46,7 @@ sym(vp8_regular_quantize_b_sse2):
mov rdi, arg(0) ; BLOCK *b
mov rsi, arg(1) ; BLOCKD *d
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
mov rdi, rcx ; BLOCK *b
mov rsi, rdx ; BLOCKD *d
%else
@@ -226,7 +226,7 @@ ZIGZAG_LOOP 15
pop rsi
pop rdi
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
pop rsi
pop rdi
%endif
@@ -236,147 +236,6 @@ ZIGZAG_LOOP 15
pop rbp
ret
-; void vp8_fast_quantize_b_sse2 | arg
-; (BLOCK *b, | 0
-; BLOCKD *d) | 1
-
-global sym(vp8_fast_quantize_b_sse2) PRIVATE
-sym(vp8_fast_quantize_b_sse2):
- push rbp
- mov rbp, rsp
- GET_GOT rbx
-
-%if ABI_IS_32BIT
- push rdi
- push rsi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- push rdi
- push rsi
- %else
- ; these registers are used for passing arguments
- %endif
-%endif
-
- ; end prolog
-
-%if ABI_IS_32BIT
- mov rdi, arg(0) ; BLOCK *b
- mov rsi, arg(1) ; BLOCKD *d
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- mov rdi, rcx ; BLOCK *b
- mov rsi, rdx ; BLOCKD *d
- %else
- ;mov rdi, rdi ; BLOCK *b
- ;mov rsi, rsi ; BLOCKD *d
- %endif
-%endif
-
- mov rax, [rdi + vp8_block_coeff]
- mov rcx, [rdi + vp8_block_round]
- mov rdx, [rdi + vp8_block_quant_fast]
-
- ; z = coeff
- movdqa xmm0, [rax]
- movdqa xmm4, [rax + 16]
-
- ; dup z so we can save sz
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
-
- ; sz = z >> 15
- psraw xmm0, 15
- psraw xmm4, 15
-
- ; x = abs(z) = (z ^ sz) - sz
- pxor xmm1, xmm0
- pxor xmm5, xmm4
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- ; x += round
- paddw xmm1, [rcx]
- paddw xmm5, [rcx + 16]
-
- mov rax, [rsi + vp8_blockd_qcoeff]
- mov rcx, [rsi + vp8_blockd_dequant]
- mov rdi, [rsi + vp8_blockd_dqcoeff]
-
- ; y = x * quant >> 16
- pmulhw xmm1, [rdx]
- pmulhw xmm5, [rdx + 16]
-
- ; x = (y ^ sz) - sz
- pxor xmm1, xmm0
- pxor xmm5, xmm4
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- ; qcoeff = x
- movdqa [rax], xmm1
- movdqa [rax + 16], xmm5
-
- ; x * dequant
- movdqa xmm2, xmm1
- movdqa xmm3, xmm5
- pmullw xmm2, [rcx]
- pmullw xmm3, [rcx + 16]
-
- ; dqcoeff = x * dequant
- movdqa [rdi], xmm2
- movdqa [rdi + 16], xmm3
-
- pxor xmm4, xmm4 ;clear all bits
- pcmpeqw xmm1, xmm4
- pcmpeqw xmm5, xmm4
-
- pcmpeqw xmm4, xmm4 ;set all bits
- pxor xmm1, xmm4
- pxor xmm5, xmm4
-
- pand xmm1, [GLOBAL(inv_zig_zag)]
- pand xmm5, [GLOBAL(inv_zig_zag + 16)]
-
- pmaxsw xmm1, xmm5
-
- mov rcx, [rsi + vp8_blockd_eob]
-
- ; now down to 8
- pshufd xmm5, xmm1, 00001110b
-
- pmaxsw xmm1, xmm5
-
- ; only 4 left
- pshuflw xmm5, xmm1, 00001110b
-
- pmaxsw xmm1, xmm5
-
- ; okay, just 2!
- pshuflw xmm5, xmm1, 00000001b
-
- pmaxsw xmm1, xmm5
-
- movd eax, xmm1
- and eax, 0xff
-
- mov BYTE PTR [rcx], al ; store eob
-
- ; begin epilog
-%if ABI_IS_32BIT
- pop rsi
- pop rdi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- pop rdi
- %endif
-%endif
-
- RESTORE_GOT
- pop rbp
- ret
-
SECTION_RODATA
align 16
inv_zig_zag:
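
Before the new intrinsics version below, it is worth spelling out the arithmetic that both the deleted assembly above and the replacement C file implement. A minimal scalar sketch, not part of the commit; field names follow the BLOCK/BLOCKD members referenced in the diff:

    #include "vp8/common/entropy.h"   /* vp8_default_inv_zig_zag */
    #include "vp8/encoder/block.h"    /* BLOCK, BLOCKD */

    /* Scalar reference for vp8_fast_quantize_b; shown for clarity only.
     * NB: the SIMD versions do the add/multiply in 16 bits (paddw/pmulhw),
     * so they can wrap where this plain-int sketch does not. */
    static void fast_quantize_b_scalar(BLOCK *b, BLOCKD *d)
    {
        int i, eob = 0;
        for (i = 0; i < 16; i++)
        {
            const int z = b->coeff[i];
            const int sz = z >> 31;               /* 0 if z >= 0, else -1 */
            int x = (z ^ sz) - sz;                /* x = abs(z) */
            int y;
            x += b->round[i];
            y = (x * b->quant_fast[i]) >> 16;     /* y = x * quant >> 16 */
            x = (y ^ sz) - sz;                    /* restore the sign */
            d->qcoeff[i] = (short)x;
            d->dqcoeff[i] = (short)(x * d->dequant[i]);
            if (x && vp8_default_inv_zig_zag[i] > eob)
                eob = vp8_default_inv_zig_zag[i]; /* 1-based zig-zag position */
        }
        *d->eob = (char)eob;
    }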
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
new file mode 100644
index 000000000..55d57ad62
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropy.h"
+#include "vp8/encoder/block.h"
+
+#include <mmintrin.h> //MMX
+#include <xmmintrin.h> //SSE
+#include <emmintrin.h> //SSE2
+
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+ __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+ __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+ __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+ __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+ __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+ __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+ __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+ __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+ __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+ __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
+
+ __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+ /* sign of z: z >> 15 */
+ sz0 = _mm_srai_epi16(z0, 15);
+ sz1 = _mm_srai_epi16(z1, 15);
+
+ /* x = abs(z): (z ^ sz) - sz */
+ x0 = _mm_xor_si128(z0, sz0);
+ x1 = _mm_xor_si128(z1, sz1);
+ x0 = _mm_sub_epi16(x0, sz0);
+ x1 = _mm_sub_epi16(x1, sz1);
+
+ /* x += round */
+ x0 = _mm_add_epi16(x0, round0);
+ x1 = _mm_add_epi16(x1, round1);
+
+ /* y = (x * quant) >> 16 */
+ y0 = _mm_mulhi_epi16(x0, quant_fast0);
+ y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+ /* x = abs(y) = (y ^ sz) - sz */
+ y0 = _mm_xor_si128(y0, sz0);
+ y1 = _mm_xor_si128(y1, sz1);
+ x0 = _mm_sub_epi16(y0, sz0);
+ x1 = _mm_sub_epi16(y1, sz1);
+
+ /* qcoeff = x */
+ _mm_store_si128((__m128i *)(d->qcoeff), x0);
+ _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+ /* x * dequant */
+ xdq0 = _mm_mullo_epi16(x0, dequant0);
+ xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+ /* dqcoeff = x * dequant */
+ _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+ _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+ /* build a mask for the zig zag */
+ zeros = _mm_setzero_si128();
+
+ x0 = _mm_cmpeq_epi16(x0, zeros);
+ x1 = _mm_cmpeq_epi16(x1, zeros);
+
+ ones = _mm_cmpeq_epi16(zeros, zeros);
+
+ x0 = _mm_xor_si128(x0, ones);
+ x1 = _mm_xor_si128(x1, ones);
+
+ x0 = _mm_and_si128(x0, inv_zig_zag0);
+ x1 = _mm_and_si128(x1, inv_zig_zag1);
+
+ x0 = _mm_max_epi16(x0, x1);
+
+ /* now down to 8 */
+ x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
+
+ x0 = _mm_max_epi16(x0, x1);
+
+ /* only 4 left */
+ x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
+
+ x0 = _mm_max_epi16(x0, x1);
+
+ /* okay, just 2! */
+ x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
+
+ x0 = _mm_max_epi16(x0, x1);
+
+ *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
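
The tail of this function deserves a note: nonzero coefficients are turned into an all-ones mask, ANDed with the 1-based inv_zig_zag positions, and the shuffle/max ladder then reduces eight lanes to the single largest surviving position, which is the eob. The reduction isolated as its own helper, a sketch under the same SSE2-only assumption and not part of the commit:

    #include <emmintrin.h>

    /* Horizontal max of eight signed 16-bit lanes, SSE2 only: the same
     * shuffle/max ladder the function above ends with. */
    static short hmax_epi16(__m128i v)
    {
        v = _mm_max_epi16(v, _mm_shuffle_epi32(v, 0x0E));   /* fold top 4 lanes in */
        v = _mm_max_epi16(v, _mm_shufflelo_epi16(v, 0x0E)); /* fold lanes 2,3 in */
        v = _mm_max_epi16(v, _mm_shufflelo_epi16(v, 0x01)); /* fold lane 1 in */
        return (short)_mm_cvtsi128_si32(v);                 /* lane 0 holds the max */
    }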
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index f21146457..dbd171bfc 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -31,7 +31,7 @@ sym(vp8_regular_quantize_b_sse4):
%define stack_size 32
sub rsp, stack_size
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
SAVE_XMM 8, u
push rdi
push rsi
@@ -43,7 +43,7 @@ sym(vp8_regular_quantize_b_sse4):
mov rdi, arg(0) ; BLOCK *b
mov rsi, arg(1) ; BLOCKD *d
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
mov rdi, rcx ; BLOCK *b
mov rsi, rdx ; BLOCKD *d
%else
@@ -240,7 +240,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
pop rbp
%else
%undef xmm5
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
pop rsi
pop rdi
RESTORE_XMM
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
index 35368894d..7b1dc119f 100644
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -27,7 +27,7 @@ sym(vp8_fast_quantize_b_ssse3):
push rdi
push rsi
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
push rdi
push rsi
%endif
@@ -38,7 +38,7 @@ sym(vp8_fast_quantize_b_ssse3):
mov rdi, arg(0) ; BLOCK *b
mov rsi, arg(1) ; BLOCKD *d
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
mov rdi, rcx ; BLOCK *b
mov rsi, rdx ; BLOCKD *d
%else
@@ -122,7 +122,7 @@ sym(vp8_fast_quantize_b_ssse3):
pop rsi
pop rdi
%else
- %ifidn __OUTPUT_FORMAT__,x64
+ %if LIBVPX_YASM_WIN64
pop rsi
pop rdi
%endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index ce9d9836b..bd92b398a 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -50,7 +50,7 @@ sym(vp8_temporal_filter_apply_sse2):
; 0x8000 >> (16 - strength)
mov rdx, 16
sub rdx, arg(4) ; 16 - strength
- movd xmm4, rdx ; can't use rdx w/ shift
+ movq xmm4, rdx ; can't use rdx w/ shift
movdqa xmm5, [GLOBAL(_const_top_bit)]
psrlw xmm5, xmm4
movdqa [rsp + rounding_bit], xmm5
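
A note on the one-line change above: rdx is a 64-bit register, so movd, which names a 32-bit move, is at best assembler-dependent when paired with it; movq is the explicit 64-bit form. The count travels through xmm4 at all because psrlw takes a variable shift count only from an xmm register or memory, never from a general-purpose register, which is what the "can't use rdx w/ shift" comment records. The same setup in intrinsics terms, a sketch not taken from the source, with a 0x8000-per-lane constant standing in for _const_top_bit:

    #include <emmintrin.h>

    /* Sketch of the rounding-bit setup: 0x8000 >> (16 - strength) in each
     * 16-bit lane, with the count routed through an xmm register just as
     * psrlw xmm5, xmm4 requires. */
    static __m128i rounding_bit(int strength)
    {
        __m128i top_bit = _mm_set1_epi16((short)0x8000);
        __m128i count   = _mm_cvtsi32_si128(16 - strength);
        return _mm_srl_epi16(top_bit, count);
    }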