diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/x86/convolve.h | 296 | ||||
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 416 | ||||
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c | 122 | ||||
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 195 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 1 |
5 files changed, 529 insertions, 501 deletions
diff --git a/vp9/common/x86/convolve.h b/vp9/common/x86/convolve.h new file mode 100644 index 000000000..de2df47e5 --- /dev/null +++ b/vp9/common/x86/convolve.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VP9_COMMON_X86_CONVOLVE_H_ +#define VP9_COMMON_X86_CONVOLVE_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +typedef void filter8_1dfunction ( + const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter +); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h); \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void highbd_filter8_1dfunction ( + const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, + int bd +); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ + ptrdiff_t src_stride, \ + uint8_t *dst8, \ + ptrdiff_t dst_stride, \ + const int16_t *filter_x, \ + int x_step_q4, \ + const int16_t *filter_y, \ + int y_step_q4, \ + int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ +} + +#define HIGH_FUN_CONV_2D(avg, opt) \ +void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 7, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + 64, dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 1, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } else { \ + vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h, bd); \ + } \ +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VP9_COMMON_X86_CONVOLVE_H_ diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 963023c53..fd55fb8c6 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -8,421 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> - -#include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" - -typedef void filter8_1dfunction ( - const unsigned char *src_ptr, - const ptrdiff_t src_pitch, - unsigned char *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const short *filter -); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] || filter[1] || filter[2]) { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h); \ - } \ -} - -#define FUN_CONV_2D(avg, opt) \ -void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \ - vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \ - vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ - } else { \ - vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ - } \ -} - -#if CONFIG_VP9_HIGHBITDEPTH - -typedef void highbd_filter8_1dfunction ( - const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] || filter[1] || filter[2]) { \ - while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} - -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ - 64, dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ - dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vp9_filter_block1d16_v8_avx2; -filter8_1dfunction vp9_filter_block1d16_h8_avx2; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 / ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 -#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 -#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 -#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 -#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 -#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 -#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 -// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); -#endif // HAVE_AX2 && HAVE_SSSE3 -#if HAVE_SSSE3 -#if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 -#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 -#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#endif // ARCH_X86_64 / ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; - -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; - -// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); -#endif // HAVE_SSSE3 +#include "./vpx_config.h" +#include "vp9/common/x86/convolve.h" #if HAVE_SSE2 filter8_1dfunction vp9_filter_block1d16_v8_sse2; diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index 3bc7d3918..cee8d1e76 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -8,7 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// immintrin.h. +#include "./vp9_rtcd.h" + #include <immintrin.h> + +#include "vp9/common/x86/convolve.h" #include "vpx_ports/mem.h" // filters for 16_h8 and 16_v8 @@ -53,23 +60,23 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; - unsigned int src_stride, dst_stride; + ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -104,9 +111,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr-3))); + _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((__m128i *) + _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer @@ -135,9 +142,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+5))); + _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((__m128i *) + _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together @@ -202,7 +209,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, @@ -237,7 +244,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, @@ -297,12 +304,12 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, } } -void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; @@ -310,11 +317,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; - unsigned int src_stride, dst_stride; + ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -344,19 +351,19 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr))); + _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, @@ -393,11 +400,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); @@ -476,7 +483,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( @@ -542,3 +549,54 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } } + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index dbd2b5b19..5fd2857e1 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -15,6 +15,7 @@ #include <tmmintrin.h> +#include "vp9/common/x86/convolve.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" @@ -44,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +// These are reused by the avx2 intrinsics. +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; + +void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; @@ -57,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -78,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); @@ -115,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; @@ -129,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -153,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); @@ -195,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; @@ -209,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -233,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); @@ -260,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // reading the next 16 bytes. // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, @@ -312,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; @@ -327,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -342,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); - srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); - srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); - srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); - srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); - srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); @@ -400,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; @@ -415,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -430,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 16 bytes - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 16 bytes - srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); + srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); @@ -514,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, output_ptr+=out_pitch; } } + +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; + +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; + +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); + +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index f629d98b8..cbc04888b 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -69,6 +69,7 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c |