diff options
-rw-r--r-- | test/convolve_test.cc | 22 | ||||
-rw-r--r-- | vp9/common/vp9_reconinter.h | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 9 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 16 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 7 | ||||
-rw-r--r-- | vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 16 | ||||
-rw-r--r-- | vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c | 4 | ||||
-rw-r--r-- | vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 4 | ||||
-rw-r--r-- | vpx_dsp/arm/highbd_vpx_convolve_neon.c | 22 | ||||
-rw-r--r-- | vpx_dsp/vpx_convolve.c | 42 | ||||
-rw-r--r-- | vpx_dsp/x86/convolve.h | 65 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_convolve_avx2.c | 8 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 2 |
13 files changed, 113 insertions, 109 deletions
diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 8b339fadf..a8bab4082 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -301,9 +301,9 @@ void wrapper_filter_average_block2d_8_c( filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, output_width, output_height); } else { - highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, vfilter, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + CAST_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height, use_highbd); } #else @@ -324,8 +324,8 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, output_width, output_height); } else { - highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, hfilter, - vfilter, CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, + vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height, use_highbd); } #else @@ -460,7 +460,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { if (UUT_->use_highbd_ == 0) { return input_ + offset; } else { - return CONVERT_TO_BYTEPTR(input16_) + offset; + return CAST_TO_BYTEPTR(input16_ + offset); } #else return input_ + offset; @@ -473,7 +473,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { if (UUT_->use_highbd_ == 0) { return output_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_) + offset; + return CAST_TO_BYTEPTR(output16_ + offset); } #else return output_ + offset; @@ -486,7 +486,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { if (UUT_->use_highbd_ == 0) { return output_ref_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_ref_) + offset; + return CAST_TO_BYTEPTR(output16_ref_ + offset); } #else return output_ref_ + offset; @@ -498,7 +498,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { if (UUT_->use_highbd_ == 0) { return list[index]; } else { - return CONVERT_TO_SHORTPTR(list)[index]; + return CAST_TO_SHORTPTR(list)[index]; } #else return list[index]; @@ -510,7 +510,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { if (UUT_->use_highbd_ == 0) { list[index] = (uint8_t)val; } else { - CONVERT_TO_SHORTPTR(list)[index] = val; + CAST_TO_SHORTPTR(list)[index] = val; } #else list[index] = (uint8_t)val; @@ -718,7 +718,7 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { if (UUT_->use_highbd_ == 0) { ref = ref8; } else { - ref = CONVERT_TO_BYTEPTR(ref16); + ref = CAST_TO_BYTEPTR(ref16); } #else uint8_t ref[kOutputStride * kMaxDimension]; @@ -797,7 +797,7 @@ TEST_P(ConvolveTest, FilterExtremes) { if (UUT_->use_highbd_ == 0) { ref = ref8; } else { - ref = CONVERT_TO_BYTEPTR(ref16); + ref = CAST_TO_BYTEPTR(ref16); } #else uint8_t ref[kOutputStride * kMaxDimension]; diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 4fed4f7f6..cb7d1c63a 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -37,8 +37,9 @@ static INLINE void highbd_inter_predictor( const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src)), src_stride, + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), dst_stride, kernel[subpel_x], + xs, kernel[subpel_y], ys, w, h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 591a85ee0..ebe6758c8 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2417,10 +2417,11 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + vpx_highbd_convolve8( + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src_ptr)), src_stride, + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst_ptr)), dst_stride, + kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf], + 16 * src_h / dst_h, 16 / factor, 16 / factor, bd); } else { vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel[x_q4 & 0xf], 16 * src_w / dst_w, diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index db2bbe7c2..f177814d6 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -2053,9 +2053,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - this_mode_pred->data, this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)), + best_pred->stride, + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(this_mode_pred->data)), + this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, @@ -2162,9 +2164,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, - bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)), + best_pred->stride, + CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(pd->dst.buf)), pd->dst.stride, + NULL, 0, NULL, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d23d32446..2278ddc0f 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -599,9 +599,10 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon = CONVERT_TO_BYTEPTR(recon); - vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, - bs, bs, xd->bd); + vpx_highbd_convolve_copy(CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), + dst_stride, recon, 32, NULL, 0, NULL, 0, bs, + bs, xd->bd); + recon = CONVERT_TO_BYTEPTR(recon16); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); } else { diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 1fde13e8d..a00aa0444 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -145,8 +145,8 @@ void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -348,8 +348,8 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -579,8 +579,8 @@ void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); @@ -748,8 +748,8 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); diff --git a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index f4d70761e..b244caea9 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -18,8 +18,8 @@ void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index a980ab1a3..9401e7b8c 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -18,8 +18,8 @@ void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; diff --git a/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_neon.c index 4e6e10992..03a36e4a0 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -18,7 +18,7 @@ void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); @@ -29,13 +29,12 @@ void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, * height and filter a multiple of 4 lines. Since this goes in to the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. */ - vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), - src_stride, CONVERT_TO_BYTEPTR(temp), w, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, bd); + vpx_highbd_convolve8_horiz_neon( + CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp), + w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, + vpx_highbd_convolve8_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } @@ -45,7 +44,7 @@ void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); @@ -55,11 +54,10 @@ void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), - src_stride, CONVERT_TO_BYTEPTR(temp), w, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, + vpx_highbd_convolve8_horiz_neon( + CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp), + w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd); + vpx_highbd_convolve8_avg_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index cab6368e6..5a62836eb 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -324,8 +324,8 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -348,8 +348,8 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -374,8 +374,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -400,8 +400,8 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -449,12 +449,12 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); - highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4, + highbd_convolve_horiz(CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) - + src_stride * (SUBPEL_TAPS / 2 - 1)), + src_stride, CAST_TO_BYTEPTR(temp), 64, x_filters, x0_q4, x_step_q4, w, intermediate_height, bd); - highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), - 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, - bd); + highbd_convolve_vert(CAST_TO_BYTEPTR(temp + 64 * (SUBPEL_TAPS / 2 - 1)), 64, + dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -541,10 +541,10 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, - 0, NULL, 0, w, h, bd); + vpx_highbd_convolve8_c(src, src_stride, CAST_TO_BYTEPTR(temp), 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(CAST_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, 0, + NULL, 0, w, h, bd); } void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, @@ -553,8 +553,8 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int r; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; @@ -575,8 +575,8 @@ void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index d7468ad7c..ea7016416 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -107,8 +107,8 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *src = CAST_TO_SHORTPTR(src8); \ + uint16_t *dst = CAST_TO_SHORTPTR(dst8); \ if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ @@ -162,36 +162,37 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt( \ + CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) - 3 * src_stride), \ + src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + CAST_TO_BYTEPTR(fdata2 + 192), 64, dst, dst_stride, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt( \ + src, src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + CAST_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 75589d32a..2b774bf23 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -21,8 +21,8 @@ void vpx_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int width, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_y; (void)filter_x_stride; @@ -104,8 +104,8 @@ void vpx_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int width, int h, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CAST_TO_SHORTPTR(src8); + uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_y; (void)filter_x_stride; diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index e2311c116..389a692db 100644 --- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -32,9 +32,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ mov r4d, dword wm %ifidn %2, highbd shl r4d, 1 - shl srcq, 1 shl src_strideq, 1 - shl dstq, 1 shl dst_strideq, 1 %else cmp r4d, 4 |