diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_filter.h | 8 | ||||
-rw-r--r-- | vp9/encoder/arm/neon/vp9_variance_neon.c | 40 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 16 | ||||
-rw-r--r-- | vp9/encoder/vp9_variance.c | 51 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_highbd_subpel_variance.asm | 16 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_subpel_variance.asm | 24 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c | 16 |
7 files changed, 61 insertions, 110 deletions
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index d963ee235..808a270fa 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -43,14 +43,6 @@ typedef int16_t InterpKernel[SUBPEL_TAPS]; const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -DECLARE_ALIGNED(256, extern const InterpKernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]); - -// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear -// filter kernel as a 2 tap filter. -#define BILINEAR_FILTERS_2TAP(x) \ - (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index 166156af7..0ac194e92 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -16,10 +16,18 @@ #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_filter.h" -#include "vp9/encoder/vp9_variance.h" +static uint8_t bilinear_filters[8][2] = { + { 128, 0, }, + { 112, 16, }, + { 96, 32, }, + { 80, 48, }, + { 64, 64, }, + { 48, 80, }, + { 32, 96, }, + { 16, 112, }, +}; static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, @@ -27,9 +35,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { - const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); - const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + const uint8_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -50,9 +58,9 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { - const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); - const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + const uint8_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -84,9 +92,9 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, - 8, BILINEAR_FILTERS_2TAP(yoffset)); + 8, bilinear_filters[yoffset]); return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } @@ -102,9 +110,9 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, - 16, BILINEAR_FILTERS_2TAP(yoffset)); + 16, bilinear_filters[yoffset]); return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } @@ -120,9 +128,9 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, - 32, BILINEAR_FILTERS_2TAP(yoffset)); + 32, bilinear_filters[yoffset]); return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } @@ -138,8 +146,8 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, - 64, BILINEAR_FILTERS_2TAP(yoffset)); + 64, bilinear_filters[yoffset]); return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 15f95829f..234272697 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -162,9 +162,9 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { error_per_bit + 4096) >> 13 : 0) -// convert motion vector component to offset for svf calc +// convert motion vector component to offset for sv[a]f calc static INLINE int sp(int x) { - return (x & 7) << 1; + return x & 7; } static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { @@ -679,16 +679,14 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - int row_offset = (tr & 0x07) << 1; - int col_offset = (tc & 0x07) << 1; MV this_mv; this_mv.row = tr; this_mv.col = tc; if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); else - thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse, second_pred); cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -709,14 +707,12 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tr = br + (cost_array[2] < cost_array[3] ? -hstep : hstep); if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - int row_offset = (tr & 0x07) << 1; - int col_offset = (tc & 0x07) << 1; MV this_mv = {tr, tc}; if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); else - thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse, second_pred); cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index 1f6b083c4..c571b7c95 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -19,6 +19,17 @@ #include "vp9/encoder/vp9_variance.h" +static uint8_t bilinear_filters[8][2] = { + { 128, 0, }, + { 112, 16, }, + { 96, 32, }, + { 80, 48, }, + { 64, 64, }, + { 48, 80, }, + { 32, 96, }, + { 16, 112, }, +}; + // Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal // or vertical direction to produce the filtered output block. Used to implement // first-pass of 2-D separable filter. @@ -33,7 +44,7 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; for (i = 0; i < output_height; i++) { @@ -65,7 +76,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; for (i = 0; i < output_height; i++) { @@ -91,9 +102,9 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ uint8_t temp2[H * W]; \ \ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - BILINEAR_FILTERS_2TAP(xoffset)); \ + bilinear_filters[xoffset]); \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ } @@ -110,9 +121,9 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - BILINEAR_FILTERS_2TAP(xoffset)); \ + bilinear_filters[xoffset]); \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ @@ -166,7 +177,7 @@ static void highbd_var_filter_block2d_bil_first_pass( int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); for (i = 0; i < output_height; i++) { @@ -192,7 +203,7 @@ static void highbd_var_filter_block2d_bil_second_pass( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; for (i = 0; i < output_height; i++) { @@ -219,9 +230,9 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ uint16_t temp2[H * W]; \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ dst_stride, sse); \ @@ -236,9 +247,9 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \ uint16_t temp2[H * W]; \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ @@ -253,9 +264,9 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \ uint16_t temp2[H * W]; \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ @@ -273,9 +284,9 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ @@ -295,9 +306,9 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ @@ -317,9 +328,9 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ diff --git a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm index 987729f96..4594bb1aa 100644 --- a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm +++ b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm @@ -14,35 +14,19 @@ SECTION_RODATA pw_8: times 8 dw 8 bilin_filter_m_sse2: times 8 dw 16 times 8 dw 0 - times 8 dw 15 - times 8 dw 1 times 8 dw 14 times 8 dw 2 - times 8 dw 13 - times 8 dw 3 times 8 dw 12 times 8 dw 4 - times 8 dw 11 - times 8 dw 5 times 8 dw 10 times 8 dw 6 - times 8 dw 9 - times 8 dw 7 times 16 dw 8 - times 8 dw 7 - times 8 dw 9 times 8 dw 6 times 8 dw 10 - times 8 dw 5 - times 8 dw 11 times 8 dw 4 times 8 dw 12 - times 8 dw 3 - times 8 dw 13 times 8 dw 2 times 8 dw 14 - times 8 dw 1 - times 8 dw 15 SECTION .text diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm index 06b8b034a..292cf34d1 100644 --- a/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/vp9/encoder/x86/vp9_subpel_variance.asm @@ -14,52 +14,28 @@ SECTION_RODATA pw_8: times 8 dw 8 bilin_filter_m_sse2: times 8 dw 16 times 8 dw 0 - times 8 dw 15 - times 8 dw 1 times 8 dw 14 times 8 dw 2 - times 8 dw 13 - times 8 dw 3 times 8 dw 12 times 8 dw 4 - times 8 dw 11 - times 8 dw 5 times 8 dw 10 times 8 dw 6 - times 8 dw 9 - times 8 dw 7 times 16 dw 8 - times 8 dw 7 - times 8 dw 9 times 8 dw 6 times 8 dw 10 - times 8 dw 5 - times 8 dw 11 times 8 dw 4 times 8 dw 12 - times 8 dw 3 - times 8 dw 13 times 8 dw 2 times 8 dw 14 - times 8 dw 1 - times 8 dw 15 bilin_filter_m_ssse3: times 8 db 16, 0 - times 8 db 15, 1 times 8 db 14, 2 - times 8 db 13, 3 times 8 db 12, 4 - times 8 db 11, 5 times 8 db 10, 6 - times 8 db 9, 7 times 16 db 8 - times 8 db 7, 9 times 8 db 6, 10 - times 8 db 5, 11 times 8 db 4, 12 - times 8 db 3, 13 times 8 db 2, 14 - times 8 db 1, 15 SECTION .text diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c index 19ac5c0df..b1c797520 100644 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -17,36 +17,20 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, - 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, - 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, - 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, - 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, - 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, - 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, - 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, - 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15 }; #define FILTER_SRC(filter) \ |