diff options
author | Johann Koenig <johannkoenig@google.com> | 2018-12-21 19:30:04 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2018-12-21 19:30:04 +0000 |
commit | 1cb039529d4c1b535093c759850835fae794d424 (patch) | |
tree | ce131fbb2b94257c1b29b77ca756f36eed553873 /vpx_dsp | |
parent | 2f7c4d276a321868210cc522dd85ff60a3e7c111 (diff) | |
parent | c67a2e76a1b317995fc6f7fe40ba773ea55272ba (diff) | |
download | libvpx-1cb039529d4c1b535093c759850835fae794d424.tar libvpx-1cb039529d4c1b535093c759850835fae794d424.tar.gz libvpx-1cb039529d4c1b535093c759850835fae794d424.tar.bz2 libvpx-1cb039529d4c1b535093c759850835fae794d424.zip |
Merge "subpixel_8t sse2: resolve missing declarations"
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 |
-rw-r--r-- | vpx_dsp/x86/vpx_asm_stubs.c | 194 |
-rw-r--r-- | vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 12 |
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 261 |
4 files changed, 223 insertions, 245 deletions
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 2495db3f4..87460bedf 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -89,7 +89,6 @@ DSP_SRCS-yes += vpx_convolve.h DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 9d6f83787..000000000 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_h4_sse2; -filter8_1dfunction vpx_filter_block1d16_v4_sse2; -filter8_1dfunction vpx_filter_block1d8_h4_sse2; -filter8_1dfunction vpx_filter_block1d8_v4_sse2; -filter8_1dfunction vpx_filter_block1d4_h4_sse2; -filter8_1dfunction vpx_filter_block1d4_v4_sse2; -#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 -#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 -#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 -#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 -#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 -#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; 
-filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h4_sse2; -#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ - vpx_highbd_filter_block1d16_v8_avg_sse2 -#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ - vpx_highbd_filter_block1d16_h8_avg_sse2 -#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ - vpx_highbd_filter_block1d8_v8_avg_sse2 -#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ - vpx_highbd_filter_block1d8_h8_avg_sse2 -#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ - vpx_highbd_filter_block1d4_v8_avg_sse2 -#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ - vpx_highbd_filter_block1d4_h8_avg_sse2 - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t 
src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index e6e72b826..c57149657 100644 --- a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -199,7 +199,7 @@ SECTION .text -;void vpx_filter_block1d4_v8_sse2 +;void vpx_highbd_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -269,7 +269,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void vpx_filter_block1d8_v8_sse2 +;void vpx_highbd_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -328,7 +328,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void vpx_filter_block1d16_v8_sse2 +;void vpx_highbd_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -554,7 +554,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vpx_filter_block1d4_h8_sse2 +;void vpx_highbd_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -629,7 +629,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void vpx_filter_block1d8_h8_sse2 +;void vpx_highbd_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -695,7 +695,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void vpx_filter_block1d16_h8_sse2 +;void vpx_highbd_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 0be2c0fef..e40fe693a 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -19,9 
+19,10 @@ #define CONV8_ROUNDING_BITS (7) #define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) -void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { __m128i kernel_reg; // Kernel __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding @@ -100,9 +101,10 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, /* The macro used to generate functions shifts the src_ptr up by 3 rows already * */ -void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { // Register for source s[-1:3, :] __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; // Interleaved rows of the source. 
lo is first half, hi second @@ -255,9 +257,10 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { __m128i kernel_reg; // Kernel __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding @@ -312,9 +315,10 @@ void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { // Register for source s[-1:3, :] __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; // Interleaved rows of the source. 
lo is first half, hi second @@ -430,9 +434,10 @@ void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { __m128i kernel_reg; // Kernel __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding @@ -495,9 +500,10 @@ void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { // Register for source s[-1:3, :] __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; // Interleaved rows of the source. lo is first half, hi second @@ -608,10 +614,10 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +static void vpx_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load multiple shifted versions of the row and shuffle them into // 16-bit words of the form // ... 
s[2] s[1] s[0] s[-1] @@ -670,10 +676,9 @@ void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load two rows of pixels as 16-bit words, and shuffle them into the // form // ... s[0,1] s[-1,1] s[0,0] s[-1,0] @@ -774,10 +779,9 @@ void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load multiple shifted versions of the row and shuffle them into // 16-bit words of the form // ... s[2] s[1] s[0] s[-1] @@ -852,10 +856,9 @@ void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d8_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load two rows of pixels as 16-bit words, and shuffle them into the // form // ... 
s[0,1] s[-1,1] s[0,0] s[-1,0] @@ -982,24 +985,194 @@ void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, height, kernel, bd); vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, dst_stride, height, kernel, bd); } -void vpx_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, height, kernel, bd); vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, dst_stride, height, kernel, bd); } +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 + +// From vpx_subpixel_8t_sse2.asm. 
+filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 +#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 +#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 +#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 +#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 +#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm. 
+filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int 
y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_, sse2); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. 
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, 
avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, + sse2); + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_, sse2); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 |