summaryrefslogtreecommitdiff
path: root/vp8
diff options
context:
space:
mode:
Diffstat (limited to 'vp8')
-rw-r--r--vp8/common/rtcd_defs.pl2
-rw-r--r--vp8/common/x86/bilinear_filter_sse2.c93
-rw-r--r--vp8/common/x86/filter_x86.c29
-rw-r--r--vp8/common/x86/filter_x86.h33
-rw-r--r--vp8/common/x86/subpixel_mmx.asm121
-rw-r--r--vp8/common/x86/vp8_asm_stubs.c1
-rw-r--r--vp8/vp8_common.mk2
7 files changed, 94 insertions, 187 deletions
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index f67025767..3ab89a338 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -167,7 +167,7 @@ add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;
+specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/;
#
# Encoder functions below this point.
diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c
index 224c1b32a..14e10eca4 100644
--- a/vp8/common/x86/bilinear_filter_sse2.c
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -14,6 +14,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_config.h"
#include "vp8/common/filter.h"
+#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_ports/mem.h"
static INLINE void horizontal_16x16(uint8_t *src, const int stride,
@@ -241,3 +242,95 @@ void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
}
+
+static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
+ const int xoffset) {
+ int h;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (xoffset == 0) {
+ for (h = 0; h < 5; ++h) {
+ const __m128i a = load_unaligned_u32(src);
+ const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+ _mm_storel_epi64((__m128i *)dst, a_u16);
+ src += stride;
+ dst += 4;
+ }
+ return;
+ }
+
+ {
+ const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+ const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+ const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+ for (h = 0; h < 5; ++h) {
+ const __m128i a = load_unaligned_u32(src);
+ const __m128i b = load_unaligned_u32(src + 1);
+ const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+ const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
+ const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
+ const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
+ const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
+ const __m128i compensated = _mm_add_epi16(sum, round_factor);
+ const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+ _mm_storel_epi64((__m128i *)dst, shifted);
+ src += stride;
+ dst += 4;
+ }
+ }
+}
+
+static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
+ const int yoffset) {
+ int h;
+
+ if (yoffset == 0) {
+ for (h = 0; h < 4; h += 2) {
+ const __m128i row = _mm_load_si128((__m128i *)src);
+ __m128i packed = _mm_packus_epi16(row, row);
+ store_unaligned_u32(dst, packed);
+ dst += stride;
+ packed = _mm_srli_si128(packed, 4);
+ store_unaligned_u32(dst, packed);
+ dst += stride;
+ src += 8;
+ }
+ return;
+ }
+
+ {
+ const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+ const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+ const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+ for (h = 0; h < 4; h += 2) {
+ const __m128i row_0 = _mm_load_si128((__m128i *)src);
+ const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
+ const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
+ const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
+ const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
+ const __m128i compensated = _mm_add_epi16(sum, round_factor);
+ const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+ __m128i packed = _mm_packus_epi16(shifted, shifted);
+ storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+ packed = _mm_srli_si128(packed, 4);
+ dst += stride;
+ storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+ dst += stride;
+ src += 8;
+ }
+ }
+}
+
+void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, uint8_t *dst_ptr,
+ int dst_pitch) {
+ uint16_t FData[4 * 5];
+
+ assert((xoffset | yoffset) != 0);
+
+ horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);
+
+ vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
+}
diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
deleted file mode 100644
index 2405342f0..000000000
--- a/vp8/common/x86/filter_x86.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/x86/filter_x86.h"
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
- { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
- { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
- { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h
deleted file mode 100644
index 570ff8666..000000000
--- a/vp8/common/x86/filter_x86.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_VP8_COMMON_X86_FILTER_X86_H_
-#define VPX_VP8_COMMON_X86_FILTER_X86_H_
-
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
- * duplicated values */
-
-/* duplicated 4x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
-
-/* duplicated 8x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_VP8_COMMON_X86_FILTER_X86_H_
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 05320d58d..67bcd0cbd 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -10,8 +10,6 @@
%include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
-
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
@@ -205,125 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx):
ret
-;void bilinear_predict4x4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
-sym(vp8_bilinear_predict4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
- shl rax, 5
-
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
-
- movsxd rdx, dword ptr arg(5) ;ldst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
- shl rax, 5
-
- add rax, rcx
- lea rcx, [rdi+rdx*4] ;
-
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
- ; get the first horizontal line done ;
- movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
-
- pmullw mm3, mm1 ;
- movd mm5, [rsi+1] ;
-
- punpcklbw mm5, mm0 ;
- pmullw mm5, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
-
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movq mm7, mm3 ;
- packuswb mm7, mm0 ;
-
- add rsi, rdx ; next line
-.next_row_4x4:
- movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
-
- pmullw mm3, mm1 ;
- movd mm5, [rsi+1] ;
-
- punpcklbw mm5, mm0 ;
- pmullw mm5, mm2 ;
-
- paddw mm3, mm5 ;
-
- movq mm5, mm7 ;
- punpcklbw mm5, mm0 ;
-
- pmullw mm5, [rax] ;
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
-
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- movq mm7, mm3 ;
-
- packuswb mm7, mm0 ;
-
- pmullw mm3, [rax+16] ;
- paddw mm3, mm5 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb mm3, mm0
- movd [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch ;
- add rsi, rdx ; next line
- add rdi, r8
-%endif
-
- cmp rdi, rcx ;
- jne .next_row_4x4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
SECTION_RODATA
align 16
rd:
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index de836f19d..7fb83c2d5 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -11,7 +11,6 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/mem.h"
-#include "filter_x86.h"
extern const short vp8_six_tap_x86[8][6 * 8];
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index d2d5712a5..9f106a2c3 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -70,8 +70,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
VP8_COMMON_SRCS-yes += common/treecoder.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c